1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cfloat>
8 #include <chrono>
9 #include <cmath>
10 #include <functional>
11 #include <limits>
12 #include <mutex>
13 #include <random>
14 #include <vector>
15
16 #include <cpuinfo.h>
17
18 #include <benchmark/benchmark.h>
19 #ifdef BENCHMARK_RUY
20 #include "ruy/ruy.h"
21 #endif // BENCHMARK_RUY
22 #include "bench/gemm.h"
23 #include "bench/utils.h"
24 #include <xnnpack/AlignedAllocator.h>
25 #include <xnnpack/common.h>
26 #include <xnnpack/gemm.h>
27 #include <xnnpack/pack.h>
28 #include <xnnpack/params-init.h>
29 #include <xnnpack/params.h>
30
GEMMBenchmark(benchmark::State & state,xnn_qs8_gemm_minmax_ukernel_function gemm,size_t mr,size_t nr,size_t kr,size_t sr,xnn_init_qs8_conv_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr,bool extended_weights=false)31 static void GEMMBenchmark(benchmark::State& state,
32 xnn_qs8_gemm_minmax_ukernel_function gemm,
33 size_t mr, size_t nr, size_t kr, size_t sr,
34 xnn_init_qs8_conv_minmax_params_fn init_params,
35 benchmark::utils::IsaCheckFunction isa_check = nullptr,
36 bool extended_weights = false)
37 {
38 if (!cpuinfo_initialize()) {
39 state.SkipWithError("cpuinfo initialization failed");
40 return;
41 }
42 if (isa_check && !isa_check(state)) {
43 return;
44 }
45
46 const size_t mc = state.range(0);
47 const size_t nc = state.range(1);
48 const size_t kc = state.range(2);
49
50 const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
51 const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
52
53 std::random_device random_device;
54 auto rng = std::mt19937(random_device());
55 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
56 auto i8rng = std::bind(
57 std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()), std::ref(rng));
58
59 std::vector<int8_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(int8_t));
60 std::generate(a.begin(), a.end(), std::ref(i8rng));
61 std::vector<int8_t> k(nc * kc);
62 std::generate(k.begin(), k.end(), std::ref(i8rng));
63 std::vector<int32_t> b(nc);
64 std::generate(b.begin(), b.end(), std::ref(i32rng));
65
66 const size_t w_element_size = extended_weights ? sizeof(int16_t) : sizeof(int8_t);
67 const size_t w_size = nc_stride * sizeof(int32_t) + kc_stride * nc_stride * w_element_size;
68 const size_t c_elements = mc * nc;
69 const size_t num_buffers = 1 +
70 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), w_size + c_elements * sizeof(int8_t));
71
72 std::vector<char, AlignedAllocator<char, 64>> w(w_size * num_buffers);
73 std::fill(w.begin(), w.end(), 0);
74 const xnn_qs8_packing_params packing_params = { 127 };
75 if (extended_weights) {
76 xnn_pack_qs8_gemm_xw_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
77 } else {
78 xnn_pack_qs8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
79 }
80 std::vector<int8_t> c(c_elements * num_buffers);
81 std::fill(c.begin(), c.end(), 0xA5);
82
83 union xnn_qs8_conv_minmax_params quantization_params;
84 init_params(&quantization_params, 0.75f, 127, -127, 126);
85
86 size_t buffer_index = 0;
87 for (auto _ : state) {
88 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
89 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
90 // - W is not in cache (for any cache level)
91 // - C is not in cache (for any cache level)
92 state.PauseTiming();
93 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
94 buffer_index = (buffer_index + 1) % num_buffers;
95 state.ResumeTiming();
96
97 for (uint32_t m = 0; m < mc; m += mr) {
98 const uint32_t mb = min(mc - m, mr);
99 for (uint32_t n = 0; n < nc; n += nr) {
100 const uint32_t nb = min(nc - n, nr);
101 gemm(
102 mb, nb, kc * sizeof(int8_t),
103 a.data() + m * kc, kc * sizeof(int8_t),
104 w.data() + w_size * buffer_index + n * (kc_stride * w_element_size + sizeof(int32_t)),
105 c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(int8_t), nr * sizeof(int8_t),
106 &quantization_params);
107 }
108 }
109 }
110
111 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
112 if (cpu_frequency != 0) {
113 state.counters["cpufreq"] = cpu_frequency;
114 }
115
116 state.counters["OPS"] = benchmark::Counter(
117 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
118 }
119
120 #ifdef BENCHMARK_RUY
RuyBenchmark(benchmark::State & state,size_t threads)121 static void RuyBenchmark(benchmark::State& state, size_t threads)
122 {
123 const size_t mc = state.range(0);
124 const size_t nc = state.range(1);
125 const size_t kc = state.range(2);
126
127 std::random_device random_device;
128 auto rng = std::mt19937(random_device());
129 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
130 auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
131
132 const size_t num_buffers = 1 +
133 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
134 nc * (sizeof(int8_t) * (mc + kc) + sizeof(int32_t)));
135
136 std::vector<int8_t> a(mc * kc);
137 std::generate(a.begin(), a.end(), std::ref(u8rng));
138 std::vector<int8_t> k(num_buffers * nc * kc);
139 std::generate(k.begin(), k.end(), std::ref(u8rng));
140 std::vector<int32_t> b(num_buffers * nc);
141 std::generate(b.begin(), b.end(), std::ref(i32rng));
142 std::vector<int8_t> c(num_buffers * nc * mc);
143 std::fill(c.begin(), c.end(), std::nanf(""));
144
145 // Note: context must be static to avoid the cost of re-creating it for each benchmark.
146 static ruy::Context context;
147 context.set_max_num_threads(threads);
148
149 ruy::Matrix<int8_t> ruy_a;
150 ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
151 ruy_a.set_zero_point(127);
152 ruy::Matrix<int8_t> ruy_b;
153 ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
154 ruy_b.set_data(a.data());
155 ruy_b.set_zero_point(127);
156 ruy::Matrix<int8_t> ruy_c;
157 ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
158 ruy_c.set_zero_point(127);
159
160 ruy::MulParams<int32_t, int8_t> mul_params;
161 mul_params.set_multiplier_fixedpoint(0x40000000);
162
163 // ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
164 // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
165 // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
166 // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
167 static std::once_flag warmup;
168 std::call_once(warmup, [&](){
169 auto start = std::chrono::steady_clock::now();
170 do {
171 ruy_a.set_data(k.data());
172 ruy_c.set_data(c.data());
173 mul_params.set_bias(b.data());
174
175 ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
176 } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
177 });
178
179 size_t buffer_index = 0;
180 for (auto _ : state) {
181 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
182 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
183 // - K is not in cache (for any cache level)
184 // - B is not in cache (for any cache level)
185 // - C is not in cache (for any cache level)
186 state.PauseTiming();
187 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
188 buffer_index = (buffer_index + 1) % num_buffers;
189 state.ResumeTiming();
190
191 ruy_a.set_data(k.data() + buffer_index * nc * kc);
192 ruy_c.set_data(c.data() + buffer_index * mc * nc);
193 mul_params.set_bias(b.data() + buffer_index * nc);
194
195 ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
196 }
197
198 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
199 if (cpu_frequency != 0) {
200 state.counters["cpufreq"] = cpu_frequency;
201 }
202
203 state.counters["OPS"] = benchmark::Counter(
204 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
205 }
206
ruy_st(benchmark::State & state,const char * net)207 static void ruy_st(benchmark::State& state, const char* net)
208 {
209 RuyBenchmark(state, 1);
210 }
211 #endif // BENCHMARK_RUY
212
213 #if XNN_ARCH_ARM && XNN_PLATFORM_JIT && XNN_ENABLE_JIT
GEMMBenchmark(benchmark::State & state,xnn_jit_gemm_code_generator_function generator,size_t mr,size_t nr,size_t kr,size_t sr,xnn_init_qs8_conv_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)214 static void GEMMBenchmark(benchmark::State& state,
215 xnn_jit_gemm_code_generator_function generator,
216 size_t mr, size_t nr, size_t kr, size_t sr,
217 xnn_init_qs8_conv_minmax_params_fn init_params,
218 benchmark::utils::IsaCheckFunction isa_check = nullptr)
219 {
220 xnn_code_buffer code_buffer;
221 xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
222 const size_t nc = state.range(1);
223 const size_t kc = state.range(2);
224 generator(&code_buffer, nc, kc, nullptr);
225 GEMMBenchmark(
226 state,
227 reinterpret_cast<xnn_qs8_gemm_minmax_ukernel_function>(code_buffer.code),
228 mr, nr, kr, sr, init_params, isa_check);
229 xnn_release_code_memory(&code_buffer);
230 }
231
jit_qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State & state,const char * net)232 static void jit_qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, const char* net) {
233 GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, 4, 8, 4, 1,
234 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
235 }
jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,const char * net)236 static void jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
237 GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, 4, 8, 1, 1,
238 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
239 }
jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)240 static void jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
241 GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
242 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
243 }
// Register the JIT-generated AArch32 QS8 GEMM benchmarks defined above.
// NOTE(review): BENCHMARK_GEMM is not defined in this file; presumably it comes from bench/gemm.h - confirm.
244 BENCHMARK_GEMM(jit_qs8_gemm_4x8c4__aarch32_neondot_ld64)
BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)245 BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
246 BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
247 #endif // XNN_ARCH_ARM && XNN_PLATFORM_JIT && XNN_ENABLE_JIT
248
249 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
250 static void qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, const char* net) {
251 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, 4, 8, 4, 1,
252 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
253 }
qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State & state,const char * net)254 static void qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State& state, const char* net) {
255 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55, 4, 8, 4, 1,
256 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
257 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State & state,const char * net)258 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
259 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, 4, 8, 1, 1,
260 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
261 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,const char * net)262 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
263 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, 4, 8, 1, 1,
264 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
265 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,const char * net)266 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
267 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, 4, 8, 1, 1,
268 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
269 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,const char * net)270 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
271 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, 4, 8, 1, 1,
272 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
273 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,const char * net)274 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
275 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, 4, 8, 1, 1,
276 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
277 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)278 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
279 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
280 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
281 }
// Register the AArch32 assembly QS8 GEMM benchmarks defined above.
// NOTE(review): BENCHMARK_GEMM is not defined in this file; presumably it comes from bench/gemm.h - confirm.
282 BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_ld64)
BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)283 BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)
284 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
285 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
286 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
287 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
288 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
289 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
290 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
291
292 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
293 static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
294 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, 4, 16, 4, 1,
295 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
296 }
qs8_gemm_1x16c4__aarch64_neondot_ld32(benchmark::State & state,const char * net)297 static void qs8_gemm_1x16c4__aarch64_neondot_ld32(benchmark::State& state, const char* net) {
298 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld32, 1, 16, 4, 1,
299 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
300 }
qs8_gemm_1x16c4__aarch64_neondot_ld64(benchmark::State & state,const char * net)301 static void qs8_gemm_1x16c4__aarch64_neondot_ld64(benchmark::State& state, const char* net) {
302 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64, 1, 16, 4, 1,
303 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
304 }
qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State & state,const char * net)305 static void qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State& state, const char* net) {
306 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32, 4, 16, 4, 1,
307 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
308 }
qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State & state,const char * net)309 static void qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State& state, const char* net) {
310 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64, 4, 16, 4, 1,
311 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
312 }
qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,const char * net)313 static void qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
314 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128, 4, 16, 4, 1,
315 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
316 }
qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State & state,const char * net)317 static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
318 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, 4, 8, 1, 1,
319 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
320 }
qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)321 static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
322 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
323 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
324 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,const char * net)325 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
326 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, 4, 16, 1, 1,
327 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
328 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,const char * net)329 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
330 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, 4, 16, 1, 1,
331 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
332 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,const char * net)333 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
334 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, 4, 16, 1, 1,
335 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
336 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)337 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
338 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, 4, 16, 1, 1,
339 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
340 }
qs8_gemm_1x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,const char * net)341 static void qs8_gemm_1x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, const char* net) {
342 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, 1, 8, 8, 1,
343 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
344 }
qs8_gemm_1x8c8__aarch64_neon_mlal(benchmark::State & state,const char * net)345 static void qs8_gemm_1x8c8__aarch64_neon_mlal(benchmark::State& state, const char* net) {
346 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, 1, 8, 8, 1,
347 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
348 }
qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,const char * net)349 static void qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, const char* net) {
350 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, 1, 8, 8, 1,
351 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
352 }
qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,const char * net)353 static void qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, const char* net) {
354 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, 1, 8, 8, 1,
355 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
356 }
qs8_gemm_2x8c8__aarch64_neon_mull(benchmark::State & state,const char * net)357 static void qs8_gemm_2x8c8__aarch64_neon_mull(benchmark::State& state, const char* net) {
358 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, 2, 8, 8, 1,
359 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
360 }
qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State & state,const char * net)361 static void qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State& state, const char* net) {
362 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, 2, 8, 8, 1,
363 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
364 }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,const char * net)365 static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, const char* net) {
366 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, 2, 8, 8, 1,
367 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
368 }
qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,const char * net)369 static void qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, const char* net) {
370 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, 2, 8, 8, 1,
371 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
372 }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,const char * net)373 static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, const char* net) {
374 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, 2, 8, 8, 1,
375 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
376 }
qs8_gemm_2x8c16__aarch64_neon_mlal(benchmark::State & state,const char * net)377 static void qs8_gemm_2x8c16__aarch64_neon_mlal(benchmark::State& state, const char* net) {
378 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, 2, 8, 16, 1,
379 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
380 }
381
// Register the AArch64 assembly QS8 GEMM benchmarks defined above.
// NOTE(review): BENCHMARK_GEMM is not defined in this file; presumably it comes from bench/gemm.h - confirm.
382 BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld32)
BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld64)383 BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld64)
384 BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld32)
385 BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld64)
386 BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld128)
387 BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
388 BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
389 BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
390 BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
391 BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
392 BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
393 BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
394 BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm)
395 BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal)
396 BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53)
397 BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53)
398 BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mull)
399 BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal)
400 BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm)
401 BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53)
402 BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
403 BENCHMARK_GEMM(qs8_gemm_2x8c16__aarch64_neon_mlal)
404 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
405
406
407 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
408 static void qs8_gemm_1x8__neon_mlal_lane(benchmark::State& state, const char* net) {
409 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, 1, 8, 1, 1,
410 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
411 }
qs8_gemm_2x8__neon_mlal_lane(benchmark::State & state,const char * net)412 static void qs8_gemm_2x8__neon_mlal_lane(benchmark::State& state, const char* net) {
413 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, 2, 8, 1, 1,
414 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
415 }
qs8_gemm_3x8__neon_mlal_lane(benchmark::State & state,const char * net)416 static void qs8_gemm_3x8__neon_mlal_lane(benchmark::State& state, const char* net) {
417 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, 3, 8, 1, 1,
418 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
419 }
qs8_gemm_4x8__neon_mlal_lane(benchmark::State & state,const char * net)420 static void qs8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
421 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, 4, 8, 1, 1,
422 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
423 }
qs8_gemm_6x8__neon_mlal_lane(benchmark::State & state,const char * net)424 static void qs8_gemm_6x8__neon_mlal_lane(benchmark::State& state, const char* net) {
425 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, 6, 8, 1, 1,
426 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
427 }
qs8_gemm_1x16__neon_mlal_lane(benchmark::State & state,const char * net)428 static void qs8_gemm_1x16__neon_mlal_lane(benchmark::State& state, const char* net) {
429 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, 1, 16, 1, 1,
430 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
431 }
qs8_gemm_2x16__neon_mlal_lane(benchmark::State & state,const char * net)432 static void qs8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
433 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, 2, 16, 1, 1,
434 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
435 }
qs8_gemm_3x16__neon_mlal_lane(benchmark::State & state,const char * net)436 static void qs8_gemm_3x16__neon_mlal_lane(benchmark::State& state, const char* net) {
437 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, 3, 16, 1, 1,
438 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
439 }
qs8_gemm_4x16__neon_mlal_lane(benchmark::State & state,const char * net)440 static void qs8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
441 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, 4, 16, 1, 1,
442 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
443 }
qs8_gemm_6x16__neon_mlal_lane(benchmark::State & state,const char * net)444 static void qs8_gemm_6x16__neon_mlal_lane(benchmark::State& state, const char* net) {
445 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, 6, 16, 1, 1,
446 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
447 }
qs8_gemm_1x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)448 static void qs8_gemm_1x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
449 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm, 1, 8, 1, 1,
450 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
451 }
qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)452 static void qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
453 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm, 2, 8, 1, 1,
454 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
455 }
qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)456 static void qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
457 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, 3, 8, 1, 1,
458 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
459 }
qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)460 static void qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
461 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm, 4, 8, 1, 1,
462 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
463 }
qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)464 static void qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
465 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, 6, 8, 1, 1,
466 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
467 }
qs8_gemm_1x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)468 static void qs8_gemm_1x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
469 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, 1, 16, 1, 1,
470 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
471 }
qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)472 static void qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
473 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, 2, 16, 1, 1,
474 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
475 }
qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)476 static void qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
477 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm, 3, 16, 1, 1,
478 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
479 }
qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)480 static void qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
481 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm, 4, 16, 1, 1,
482 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
483 }
qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)484 static void qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
485 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm, 6, 16, 1, 1,
486 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
487 }
qs8_gemm_1x8c2__neon_mull_dup(benchmark::State & state,const char * net)488 static void qs8_gemm_1x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
489 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, 1, 8, 2, 1,
490 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
491 }
qs8_gemm_2x8c2__neon_mull_dup(benchmark::State & state,const char * net)492 static void qs8_gemm_2x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
493 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, 2, 8, 2, 1,
494 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
495 }
qs8_gemm_3x8c2__neon_mull_dup(benchmark::State & state,const char * net)496 static void qs8_gemm_3x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
497 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, 3, 8, 2, 1,
498 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
499 }
qs8_gemm_4x8c2__neon_mull_dup(benchmark::State & state,const char * net)500 static void qs8_gemm_4x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
501 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, 4, 8, 2, 1,
502 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
503 }
qs8_gemm_1x16c2__neon_mull_dup(benchmark::State & state,const char * net)504 static void qs8_gemm_1x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
505 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, 1, 16, 2, 1,
506 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
507 }
qs8_gemm_2x16c2__neon_mull_dup(benchmark::State & state,const char * net)508 static void qs8_gemm_2x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
509 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, 2, 16, 2, 1,
510 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
511 }
qs8_gemm_3x16c2__neon_mull_dup(benchmark::State & state,const char * net)512 static void qs8_gemm_3x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
513 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, 3, 16, 2, 1,
514 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
515 }
qs8_gemm_4x16c2__neon_mull_dup(benchmark::State & state,const char * net)516 static void qs8_gemm_4x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
517 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, 4, 16, 2, 1,
518 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
519 }
qs8_gemm_1x8c2__neon_mlal_dup(benchmark::State & state,const char * net)520 static void qs8_gemm_1x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
521 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, 1, 8, 2, 1,
522 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
523 }
qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State & state,const char * net)524 static void qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
525 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, 2, 8, 2, 1,
526 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
527 }
qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State & state,const char * net)528 static void qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
529 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, 3, 8, 2, 1,
530 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
531 }
qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State & state,const char * net)532 static void qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
533 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, 4, 8, 2, 1,
534 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
535 }
qs8_gemm_1x16c2__neon_mlal_dup(benchmark::State & state,const char * net)536 static void qs8_gemm_1x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
537 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, 1, 16, 2, 1,
538 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
539 }
qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State & state,const char * net)540 static void qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
541 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, 2, 16, 2, 1,
542 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
543 }
qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State & state,const char * net)544 static void qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
545 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, 3, 16, 2, 1,
546 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
547 }
qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State & state,const char * net)548 static void qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
549 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, 4, 16, 2, 1,
550 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
551 }
qs8_gemm_1x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)552 static void qs8_gemm_1x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
553 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, 1, 8, 2, 1,
554 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
555 }
qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)556 static void qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
557 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, 2, 8, 2, 1,
558 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
559 }
qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)560 static void qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
561 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, 3, 8, 2, 1,
562 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
563 }
qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)564 static void qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
565 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, 4, 8, 2, 1,
566 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
567 }
qs8_gemm_1x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)568 static void qs8_gemm_1x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
569 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r, 1, 16, 2, 1,
570 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
571 }
qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)572 static void qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
573 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r, 2, 16, 2, 1,
574 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
575 }
qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)576 static void qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
577 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r, 3, 16, 2, 1,
578 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
579 }
qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)580 static void qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
581 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, 4, 16, 2, 1,
582 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
583 }
qs8_gemm_1x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)584 static void qs8_gemm_1x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
585 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, 1, 8, 2, 1,
586 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
587 }
qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)588 static void qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
589 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, 2, 8, 2, 1,
590 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
591 }
qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)592 static void qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
593 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, 3, 8, 2, 1,
594 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
595 }
qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)596 static void qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
597 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, 4, 8, 2, 1,
598 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
599 }
qs8_gemm_1x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)600 static void qs8_gemm_1x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
601 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, 1, 16, 2, 1,
602 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
603 }
qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)604 static void qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
605 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r, 2, 16, 2, 1,
606 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
607 }
qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)608 static void qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
609 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r, 3, 16, 2, 1,
610 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
611 }
qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)612 static void qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
613 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r, 4, 16, 2, 1,
614 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
615 }
qs8_gemm_1x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)616 static void qs8_gemm_1x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
617 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, 1, 8, 2, 1,
618 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
619 }
qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)620 static void qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
621 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r, 2, 8, 2, 1,
622 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
623 }
qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)624 static void qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
625 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r, 3, 8, 2, 1,
626 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
627 }
qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)628 static void qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
629 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r, 4, 8, 2, 1,
630 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
631 }
qs8_gemm_1x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)632 static void qs8_gemm_1x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
633 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, 1, 16, 2, 1,
634 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
635 }
qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)636 static void qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
637 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, 2, 16, 2, 1,
638 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
639 }
qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)640 static void qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
641 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, 3, 16, 2, 1,
642 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
643 }
qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)644 static void qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
645 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, 4, 16, 2, 1,
646 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
647 }
qs8_gemm_1x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)648 static void qs8_gemm_1x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
649 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, 1, 8, 2, 1,
650 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
651 }
qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)652 static void qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
653 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r, 2, 8, 2, 1,
654 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
655 }
qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)656 static void qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
657 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r, 3, 8, 2, 1,
658 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
659 }
qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)660 static void qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
661 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r, 4, 8, 2, 1,
662 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
663 }
qs8_gemm_1x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)664 static void qs8_gemm_1x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
665 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, 1, 16, 2, 1,
666 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
667 }
qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)668 static void qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
669 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, 2, 16, 2, 1,
670 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
671 }
qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)672 static void qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
673 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, 3, 16, 2, 1,
674 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
675 }
qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)676 static void qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
677 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, 4, 16, 2, 1,
678 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
679 }
qs8_gemm_1x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)680 static void qs8_gemm_1x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
681 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, 1, 8, 2, 1,
682 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
683 }
qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)684 static void qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
685 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, 2, 8, 2, 1,
686 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
687 }
qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)688 static void qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
689 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r, 3, 8, 2, 1,
690 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
691 }
qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)692 static void qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
693 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r, 4, 8, 2, 1,
694 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
695 }
qs8_gemm_1x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)696 static void qs8_gemm_1x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
697 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r, 1, 16, 2, 1,
698 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
699 }
qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)700 static void qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
701 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r, 2, 16, 2, 1,
702 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
703 }
qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)704 static void qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
705 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r, 3, 16, 2, 1,
706 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
707 }
qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)708 static void qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
709 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r, 4, 16, 2, 1,
710 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
711 }
qs8_gemm_1x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)712 static void qs8_gemm_1x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
713 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, 1, 8, 2, 1,
714 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
715 }
qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)716 static void qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
717 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r, 2, 8, 2, 1,
718 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
719 }
qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)720 static void qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
721 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r, 3, 8, 2, 1,
722 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
723 }
qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)724 static void qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
725 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r, 4, 8, 2, 1,
726 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
727 }
qs8_gemm_1x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)728 static void qs8_gemm_1x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
729 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, 1, 16, 2, 1,
730 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
731 }
qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)732 static void qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
733 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, 2, 16, 2, 1,
734 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
735 }
qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)736 static void qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
737 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, 3, 16, 2, 1,
738 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
739 }
qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)740 static void qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
741 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r, 4, 16, 2, 1,
742 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
743 }
qs8_gemm_1x8c2s4__neon_mull(benchmark::State & state,const char * net)744 static void qs8_gemm_1x8c2s4__neon_mull(benchmark::State& state, const char* net) {
745 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, 1, 8, 2, 4,
746 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
747 }
qs8_gemm_2x8c2s4__neon_mull(benchmark::State & state,const char * net)748 static void qs8_gemm_2x8c2s4__neon_mull(benchmark::State& state, const char* net) {
749 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, 2, 8, 2, 4,
750 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
751 }
qs8_gemm_3x8c2s4__neon_mull(benchmark::State & state,const char * net)752 static void qs8_gemm_3x8c2s4__neon_mull(benchmark::State& state, const char* net) {
753 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, 3, 8, 2, 4,
754 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
755 }
qs8_gemm_4x8c2s4__neon_mull(benchmark::State & state,const char * net)756 static void qs8_gemm_4x8c2s4__neon_mull(benchmark::State& state, const char* net) {
757 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, 4, 8, 2, 4,
758 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
759 }
qs8_gemm_1x16c2s4__neon_mull(benchmark::State & state,const char * net)760 static void qs8_gemm_1x16c2s4__neon_mull(benchmark::State& state, const char* net) {
761 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, 1, 16, 2, 4,
762 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
763 }
qs8_gemm_2x16c2s4__neon_mull(benchmark::State & state,const char * net)764 static void qs8_gemm_2x16c2s4__neon_mull(benchmark::State& state, const char* net) {
765 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, 2, 16, 2, 4,
766 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
767 }
qs8_gemm_3x16c2s4__neon_mull(benchmark::State & state,const char * net)768 static void qs8_gemm_3x16c2s4__neon_mull(benchmark::State& state, const char* net) {
769 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, 3, 16, 2, 4,
770 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
771 }
qs8_gemm_4x16c2s4__neon_mull(benchmark::State & state,const char * net)772 static void qs8_gemm_4x16c2s4__neon_mull(benchmark::State& state, const char* net) {
773 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, 4, 16, 2, 4,
774 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
775 }
qs8_gemm_1x8c2s4__neon_mlal(benchmark::State & state,const char * net)776 static void qs8_gemm_1x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
777 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, 1, 8, 2, 4,
778 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
779 }
qs8_gemm_2x8c2s4__neon_mlal(benchmark::State & state,const char * net)780 static void qs8_gemm_2x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
781 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, 2, 8, 2, 4,
782 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
783 }
qs8_gemm_3x8c2s4__neon_mlal(benchmark::State & state,const char * net)784 static void qs8_gemm_3x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
785 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, 3, 8, 2, 4,
786 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
787 }
qs8_gemm_4x8c2s4__neon_mlal(benchmark::State & state,const char * net)788 static void qs8_gemm_4x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
789 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, 4, 8, 2, 4,
790 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
791 }
qs8_gemm_1x16c2s4__neon_mlal(benchmark::State & state,const char * net)792 static void qs8_gemm_1x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
793 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, 1, 16, 2, 4,
794 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
795 }
qs8_gemm_2x16c2s4__neon_mlal(benchmark::State & state,const char * net)796 static void qs8_gemm_2x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
797 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, 2, 16, 2, 4,
798 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
799 }
qs8_gemm_3x16c2s4__neon_mlal(benchmark::State & state,const char * net)800 static void qs8_gemm_3x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
801 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, 3, 16, 2, 4,
802 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
803 }
qs8_gemm_4x16c2s4__neon_mlal(benchmark::State & state,const char * net)804 static void qs8_gemm_4x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
805 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, 4, 16, 2, 4,
806 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
807 }
qs8_gemm_1x8c4__neon_mull_dup(benchmark::State & state,const char * net)808 static void qs8_gemm_1x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
809 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, 1, 8, 4, 1,
810 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
811 }
qs8_gemm_2x8c4__neon_mull_dup(benchmark::State & state,const char * net)812 static void qs8_gemm_2x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
813 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, 2, 8, 4, 1,
814 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
815 }
qs8_gemm_3x8c4__neon_mull_dup(benchmark::State & state,const char * net)816 static void qs8_gemm_3x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
817 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, 3, 8, 4, 1,
818 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
819 }
qs8_gemm_4x8c4__neon_mull_dup(benchmark::State & state,const char * net)820 static void qs8_gemm_4x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
821 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, 4, 8, 4, 1,
822 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
823 }
qs8_gemm_1x16c4__neon_mull_dup(benchmark::State & state,const char * net)824 static void qs8_gemm_1x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
825 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, 1, 16, 4, 1,
826 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
827 }
qs8_gemm_2x16c4__neon_mull_dup(benchmark::State & state,const char * net)828 static void qs8_gemm_2x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
829 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, 2, 16, 4, 1,
830 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
831 }
qs8_gemm_3x16c4__neon_mull_dup(benchmark::State & state,const char * net)832 static void qs8_gemm_3x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
833 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, 3, 16, 4, 1,
834 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
835 }
qs8_gemm_4x16c4__neon_mull_dup(benchmark::State & state,const char * net)836 static void qs8_gemm_4x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
837 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, 4, 16, 4, 1,
838 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
839 }
qs8_gemm_1x8c4__neon_mlal_dup(benchmark::State & state,const char * net)840 static void qs8_gemm_1x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
841 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, 1, 8, 4, 1,
842 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
843 }
qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State & state,const char * net)844 static void qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
845 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, 2, 8, 4, 1,
846 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
847 }
qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State & state,const char * net)848 static void qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
849 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, 3, 8, 4, 1,
850 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
851 }
qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State & state,const char * net)852 static void qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
853 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, 4, 8, 4, 1,
854 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
855 }
qs8_gemm_1x16c4__neon_mlal_dup(benchmark::State & state,const char * net)856 static void qs8_gemm_1x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
857 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, 1, 16, 4, 1,
858 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
859 }
qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State & state,const char * net)860 static void qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
861 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, 2, 16, 4, 1,
862 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
863 }
qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State & state,const char * net)864 static void qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
865 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, 3, 16, 4, 1,
866 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
867 }
qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State & state,const char * net)868 static void qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
869 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, 4, 16, 4, 1,
870 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
871 }
qs8_gemm_1x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)872 static void qs8_gemm_1x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
873 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, 1, 8, 4, 1,
874 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
875 }
qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)876 static void qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
877 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, 2, 8, 4, 1,
878 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
879 }
qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)880 static void qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
881 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, 3, 8, 4, 1,
882 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
883 }
qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)884 static void qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
885 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, 4, 8, 4, 1,
886 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
887 }
qs8_gemm_1x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)888 static void qs8_gemm_1x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
889 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r, 1, 16, 4, 1,
890 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
891 }
qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)892 static void qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
893 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r, 2, 16, 4, 1,
894 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
895 }
qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)896 static void qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
897 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r, 3, 16, 4, 1,
898 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
899 }
qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)900 static void qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
901 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, 4, 16, 4, 1,
902 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
903 }
qs8_gemm_1x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)904 static void qs8_gemm_1x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
905 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, 1, 8, 4, 1,
906 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
907 }
qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)908 static void qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
909 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, 2, 8, 4, 1,
910 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
911 }
qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)912 static void qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
913 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, 3, 8, 4, 1,
914 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
915 }
qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)916 static void qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
917 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, 4, 8, 4, 1,
918 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
919 }
qs8_gemm_1x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)920 static void qs8_gemm_1x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
921 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r, 1, 16, 4, 1,
922 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
923 }
qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)924 static void qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
925 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r, 2, 16, 4, 1,
926 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
927 }
qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)928 static void qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
929 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r, 3, 16, 4, 1,
930 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
931 }
qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)932 static void qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
933 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, 4, 16, 4, 1,
934 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
935 }
qs8_gemm_1x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)936 static void qs8_gemm_1x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
937 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, 1, 8, 4, 1,
938 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
939 }
qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)940 static void qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
941 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r, 2, 8, 4, 1,
942 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
943 }
qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)944 static void qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
945 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r, 3, 8, 4, 1,
946 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
947 }
qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)948 static void qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
949 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r, 4, 8, 4, 1,
950 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
951 }
qs8_gemm_1x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)952 static void qs8_gemm_1x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
953 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, 1, 16, 4, 1,
954 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
955 }
qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)956 static void qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
957 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, 2, 16, 4, 1,
958 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
959 }
qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)960 static void qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
961 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, 3, 16, 4, 1,
962 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
963 }
qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)964 static void qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
965 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, 4, 16, 4, 1,
966 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
967 }
qs8_gemm_1x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)968 static void qs8_gemm_1x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
969 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, 1, 8, 4, 1,
970 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
971 }
qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)972 static void qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
973 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r, 2, 8, 4, 1,
974 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
975 }
qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)976 static void qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
977 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r, 3, 8, 4, 1,
978 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
979 }
qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)980 static void qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
981 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r, 4, 8, 4, 1,
982 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
983 }
qs8_gemm_1x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)984 static void qs8_gemm_1x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
985 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, 1, 16, 4, 1,
986 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
987 }
qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)988 static void qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
989 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, 2, 16, 4, 1,
990 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
991 }
qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)992 static void qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
993 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, 3, 16, 4, 1,
994 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
995 }
qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)996 static void qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
997 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, 4, 16, 4, 1,
998 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
999 }
qs8_gemm_1x8c8__neon_mull(benchmark::State & state,const char * net)1000 static void qs8_gemm_1x8c8__neon_mull(benchmark::State& state, const char* net) {
1001 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull, 1, 8, 8, 1,
1002 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1003 }
qs8_gemm_2x8c8__neon_mull(benchmark::State & state,const char * net)1004 static void qs8_gemm_2x8c8__neon_mull(benchmark::State& state, const char* net) {
1005 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mull, 2, 8, 8, 1,
1006 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1007 }
qs8_gemm_3x8c8__neon_mull(benchmark::State & state,const char * net)1008 static void qs8_gemm_3x8c8__neon_mull(benchmark::State& state, const char* net) {
1009 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mull, 3, 8, 8, 1,
1010 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1011 }
qs8_gemm_4x8c8__neon_mull(benchmark::State & state,const char * net)1012 static void qs8_gemm_4x8c8__neon_mull(benchmark::State& state, const char* net) {
1013 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mull, 4, 8, 8, 1,
1014 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1015 }
qs8_gemm_1x16c8__neon_mull(benchmark::State & state,const char * net)1016 static void qs8_gemm_1x16c8__neon_mull(benchmark::State& state, const char* net) {
1017 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull, 1, 16, 8, 1,
1018 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1019 }
qs8_gemm_2x16c8__neon_mull(benchmark::State & state,const char * net)1020 static void qs8_gemm_2x16c8__neon_mull(benchmark::State& state, const char* net) {
1021 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull, 2, 16, 8, 1,
1022 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1023 }
qs8_gemm_3x16c8__neon_mull(benchmark::State & state,const char * net)1024 static void qs8_gemm_3x16c8__neon_mull(benchmark::State& state, const char* net) {
1025 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull, 3, 16, 8, 1,
1026 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1027 }
qs8_gemm_4x16c8__neon_mull(benchmark::State & state,const char * net)1028 static void qs8_gemm_4x16c8__neon_mull(benchmark::State& state, const char* net) {
1029 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull, 4, 16, 8, 1,
1030 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1031 }
qs8_gemm_1x8c8__neon_mlal(benchmark::State & state,const char * net)1032 static void qs8_gemm_1x8c8__neon_mlal(benchmark::State& state, const char* net) {
1033 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, 1, 8, 8, 1,
1034 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1035 }
qs8_gemm_2x8c8__neon_mlal(benchmark::State & state,const char * net)1036 static void qs8_gemm_2x8c8__neon_mlal(benchmark::State& state, const char* net) {
1037 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, 2, 8, 8, 1,
1038 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1039 }
qs8_gemm_3x8c8__neon_mlal(benchmark::State & state,const char * net)1040 static void qs8_gemm_3x8c8__neon_mlal(benchmark::State& state, const char* net) {
1041 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, 3, 8, 8, 1,
1042 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1043 }
qs8_gemm_4x8c8__neon_mlal(benchmark::State & state,const char * net)1044 static void qs8_gemm_4x8c8__neon_mlal(benchmark::State& state, const char* net) {
1045 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, 4, 8, 8, 1,
1046 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1047 }
qs8_gemm_1x16c8__neon_mlal(benchmark::State & state,const char * net)1048 static void qs8_gemm_1x16c8__neon_mlal(benchmark::State& state, const char* net) {
1049 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal, 1, 16, 8, 1,
1050 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1051 }
qs8_gemm_2x16c8__neon_mlal(benchmark::State & state,const char * net)1052 static void qs8_gemm_2x16c8__neon_mlal(benchmark::State& state, const char* net) {
1053 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal, 2, 16, 8, 1,
1054 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1055 }
qs8_gemm_3x16c8__neon_mlal(benchmark::State & state,const char * net)1056 static void qs8_gemm_3x16c8__neon_mlal(benchmark::State& state, const char* net) {
1057 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, 3, 16, 8, 1,
1058 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1059 }
qs8_gemm_4x16c8__neon_mlal(benchmark::State & state,const char * net)1060 static void qs8_gemm_4x16c8__neon_mlal(benchmark::State& state, const char* net) {
1061 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, 4, 16, 8, 1,
1062 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1063 }
qs8_gemm_1x8c16__neon_mlal(benchmark::State & state,const char * net)1064 static void qs8_gemm_1x8c16__neon_mlal(benchmark::State& state, const char* net) {
1065 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, 1, 8, 16, 1,
1066 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1067 }
qs8_gemm_2x8c16__neon_mlal(benchmark::State & state,const char * net)1068 static void qs8_gemm_2x8c16__neon_mlal(benchmark::State& state, const char* net) {
1069 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, 2, 8, 16, 1,
1070 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1071 }
qs8_gemm_3x8c16__neon_mlal(benchmark::State & state,const char * net)1072 static void qs8_gemm_3x8c16__neon_mlal(benchmark::State& state, const char* net) {
1073 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, 3, 8, 16, 1,
1074 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1075 }
qs8_gemm_4x8c16__neon_mlal(benchmark::State & state,const char * net)1076 static void qs8_gemm_4x8c16__neon_mlal(benchmark::State& state, const char* net) {
1077 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, 4, 8, 16, 1,
1078 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1079 }
qs8_gemm_1x16c16__neon_mlal(benchmark::State & state,const char * net)1080 static void qs8_gemm_1x16c16__neon_mlal(benchmark::State& state, const char* net) {
1081 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, 1, 16, 16, 1,
1082 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1083 }
qs8_gemm_2x16c16__neon_mlal(benchmark::State & state,const char * net)1084 static void qs8_gemm_2x16c16__neon_mlal(benchmark::State& state, const char* net) {
1085 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, 2, 16, 16, 1,
1086 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1087 }
qs8_gemm_3x16c16__neon_mlal(benchmark::State & state,const char * net)1088 static void qs8_gemm_3x16c16__neon_mlal(benchmark::State& state, const char* net) {
1089 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal, 3, 16, 16, 1,
1090 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1091 }
qs8_gemm_4x16c16__neon_mlal(benchmark::State & state,const char * net)1092 static void qs8_gemm_4x16c16__neon_mlal(benchmark::State& state, const char* net) {
1093 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal, 4, 16, 16, 1,
1094 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1095 }
qs8_gemm_1x8c4__neondot(benchmark::State & state,const char * net)1096 static void qs8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
1097 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, 1, 8, 4, 1,
1098 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1099 }
qs8_gemm_4x8c4__neondot(benchmark::State & state,const char * net)1100 static void qs8_gemm_4x8c4__neondot(benchmark::State& state, const char* net) {
1101 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, 4, 8, 4, 1,
1102 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1103 }
qs8_gemm_6x8c4__neondot(benchmark::State & state,const char * net)1104 static void qs8_gemm_6x8c4__neondot(benchmark::State& state, const char* net) {
1105 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, 6, 8, 4, 1,
1106 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1107 }
qs8_gemm_8x8c4__neondot(benchmark::State & state,const char * net)1108 static void qs8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
1109 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, 8, 8, 4, 1,
1110 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1111 }
qs8_gemm_1x16c4__neondot(benchmark::State & state,const char * net)1112 static void qs8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
1113 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, 1, 16, 4, 1,
1114 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1115 }
qs8_gemm_4x16c4__neondot(benchmark::State & state,const char * net)1116 static void qs8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
1117 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, 4, 16, 4, 1,
1118 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1119 }
qs8_gemm_6x16c4__neondot(benchmark::State & state,const char * net)1120 static void qs8_gemm_6x16c4__neondot(benchmark::State& state, const char* net) {
1121 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, 6, 16, 4, 1,
1122 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1123 }
qs8_gemm_8x16c4__neondot(benchmark::State & state,const char * net)1124 static void qs8_gemm_8x16c4__neondot(benchmark::State& state, const char* net) {
1125 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, 8, 16, 4, 1,
1126 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1127 }
// Apply BENCHMARK_GEMM to each c4 NEON wrapper (mull/mlal with dup, ld1r and ld2r loads).
// NOTE(review): BENCHMARK_GEMM is defined outside this chunk (presumably bench/gemm.h) — confirm
// how it registers each wrapper and what it passes as `net`.
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_ld2r)
// Apply BENCHMARK_GEMM to each c2 NEON wrapper (mull/mlal with dup, ld1r, ld2r and ld4r loads).
// NOTE(review): these wrappers are not visible in this chunk — presumably defined earlier in the file.
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld4r)
// Apply BENCHMARK_GEMM to each c2s4 NEON wrapper (mull and mlal).
// NOTE(review): these wrappers are not visible in this chunk — presumably defined earlier in the file.
BENCHMARK_GEMM(qs8_gemm_1x8c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_2x8c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_3x8c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_4x8c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_1x16c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_2x16c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_3x16c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_4x16c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_1x8c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x8c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x8c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x8c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_1x16c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x16c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x16c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x16c2s4__neon_mlal)
// Apply BENCHMARK_GEMM to each NEON mlal_lane wrapper, first the plain and then the `_prfm` variants.
// NOTE(review): these wrappers are not visible in this chunk — presumably defined earlier in the file.
BENCHMARK_GEMM(qs8_gemm_1x8__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_2x8__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_3x8__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_6x8__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_1x16__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_3x16__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_4x16__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_6x16__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_1x8__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_2x8__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_3x8__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_6x8__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_1x16__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_3x16__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_4x16__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_6x16__neon_mlal_lane_prfm)
// Apply BENCHMARK_GEMM to each c8 (mull, mlal) and c16 (mlal) NEON wrapper.
// NOTE(review): BENCHMARK_GEMM is defined outside this chunk (presumably bench/gemm.h) — confirm
// how it registers each wrapper.
BENCHMARK_GEMM(qs8_gemm_1x8c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_2x8c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_3x8c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_4x8c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_1x16c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_2x16c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_3x16c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_4x16c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_1x8c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x8c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x8c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x8c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_1x16c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x16c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x16c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x16c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_1x8c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x8c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x8c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x8c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_1x16c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x16c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x16c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x16c16__neon_mlal)
1300
// Apply BENCHMARK_GEMM to each c4 NEON dot-product wrapper (the ones gated by CheckNEONDOT).
BENCHMARK_GEMM(qs8_gemm_1x8c4__neondot)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neondot)
BENCHMARK_GEMM(qs8_gemm_6x8c4__neondot)
BENCHMARK_GEMM(qs8_gemm_8x8c4__neondot)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neondot)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neondot)
BENCHMARK_GEMM(qs8_gemm_6x16c4__neondot)
BENCHMARK_GEMM(qs8_gemm_8x16c4__neondot)
1309 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1310
1311
1312 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1313 static void qs8_gemm_2x16c8__avx512skx(benchmark::State& state, const char* net) {
1314 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, 2, 16, 8, 1,
1315 xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1316 }
qs8_gemm_3x16c8__avx512skx(benchmark::State & state,const char * net)1317 static void qs8_gemm_3x16c8__avx512skx(benchmark::State& state, const char* net) {
1318 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, 3, 16, 8, 1,
1319 xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1320 }
qs8_gemm_4x16c8__avx512skx(benchmark::State & state,const char * net)1321 static void qs8_gemm_4x16c8__avx512skx(benchmark::State& state, const char* net) {
1322 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, 4, 16, 8, 1,
1323 xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1324 }
1325
qs8_gemm_2x8c8__avx2(benchmark::State & state,const char * net)1326 static void qs8_gemm_2x8c8__avx2(benchmark::State& state, const char* net) {
1327 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2, 2, 8, 8, 1,
1328 xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2);
1329 }
qs8_gemm_3x8c8__avx2(benchmark::State & state,const char * net)1330 static void qs8_gemm_3x8c8__avx2(benchmark::State& state, const char* net) {
1331 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2, 3, 8, 8, 1,
1332 xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2);
1333 }
1334
qs8_gemm_xw_2x8c8__avx2(benchmark::State & state,const char * net)1335 static void qs8_gemm_xw_2x8c8__avx2(benchmark::State& state, const char* net) {
1336 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, 2, 8, 8, 1,
1337 xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
1338 }
qs8_gemm_xw_3x8c8__avx2(benchmark::State & state,const char * net)1339 static void qs8_gemm_xw_3x8c8__avx2(benchmark::State& state, const char* net) {
1340 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, 3, 8, 8, 1,
1341 xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
1342 }
1343
qs8_gemm_2x4c2__xop_ld64(benchmark::State & state,const char * net)1344 static void qs8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
1345 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, 2, 4, 2, 1,
1346 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1347 }
qs8_gemm_3x4c2__xop_ld64(benchmark::State & state,const char * net)1348 static void qs8_gemm_3x4c2__xop_ld64(benchmark::State& state, const char* net) {
1349 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, 3, 4, 2, 1,
1350 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1351 }
qs8_gemm_4x4c2__xop_ld64(benchmark::State & state,const char * net)1352 static void qs8_gemm_4x4c2__xop_ld64(benchmark::State& state, const char* net) {
1353 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, 4, 4, 2, 1,
1354 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1355 }
1356
qs8_gemm_2x4c2__xop_ld128(benchmark::State & state,const char * net)1357 static void qs8_gemm_2x4c2__xop_ld128(benchmark::State& state, const char* net) {
1358 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, 2, 4, 2, 1,
1359 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1360 }
qs8_gemm_3x4c2__xop_ld128(benchmark::State & state,const char * net)1361 static void qs8_gemm_3x4c2__xop_ld128(benchmark::State& state, const char* net) {
1362 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, 3, 4, 2, 1,
1363 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1364 }
qs8_gemm_4x4c2__xop_ld128(benchmark::State & state,const char * net)1365 static void qs8_gemm_4x4c2__xop_ld128(benchmark::State& state, const char* net) {
1366 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, 4, 4, 2, 1,
1367 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1368 }
1369
qs8_gemm_xw_2x4c2__xop(benchmark::State & state,const char * net)1370 static void qs8_gemm_xw_2x4c2__xop(benchmark::State& state, const char* net) {
1371 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, 2, 4, 2, 1,
1372 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1373 }
qs8_gemm_xw_3x4c2__xop(benchmark::State & state,const char * net)1374 static void qs8_gemm_xw_3x4c2__xop(benchmark::State& state, const char* net) {
1375 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, 3, 4, 2, 1,
1376 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1377 }
qs8_gemm_xw_4x4c2__xop(benchmark::State & state,const char * net)1378 static void qs8_gemm_xw_4x4c2__xop(benchmark::State& state, const char* net) {
1379 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, 4, 4, 2, 1,
1380 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1381 }
1382
qs8_gemm_2x4c8__xop_ld64(benchmark::State & state,const char * net)1383 static void qs8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
1384 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, 2, 4, 8, 1,
1385 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1386 }
qs8_gemm_3x4c8__xop_ld64(benchmark::State & state,const char * net)1387 static void qs8_gemm_3x4c8__xop_ld64(benchmark::State& state, const char* net) {
1388 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, 3, 4, 8, 1,
1389 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1390 }
1391
qs8_gemm_2x4c8__xop_ld128(benchmark::State & state,const char * net)1392 static void qs8_gemm_2x4c8__xop_ld128(benchmark::State& state, const char* net) {
1393 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, 2, 4, 8, 1,
1394 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1395 }
qs8_gemm_3x4c8__xop_ld128(benchmark::State & state,const char * net)1396 static void qs8_gemm_3x4c8__xop_ld128(benchmark::State& state, const char* net) {
1397 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, 3, 4, 8, 1,
1398 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1399 }
1400
qs8_gemm_xw_2x4c8__xop(benchmark::State & state,const char * net)1401 static void qs8_gemm_xw_2x4c8__xop(benchmark::State& state, const char* net) {
1402 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, 2, 4, 8, 1,
1403 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1404 }
qs8_gemm_xw_3x4c8__xop(benchmark::State & state,const char * net)1405 static void qs8_gemm_xw_3x4c8__xop(benchmark::State& state, const char* net) {
1406 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, 3, 4, 8, 1,
1407 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1408 }
1409
qs8_gemm_2x4c2__avx_ld64(benchmark::State & state,const char * net)1410 static void qs8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
1411 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, 2, 4, 2, 1,
1412 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1413 }
qs8_gemm_3x4c2__avx_ld64(benchmark::State & state,const char * net)1414 static void qs8_gemm_3x4c2__avx_ld64(benchmark::State& state, const char* net) {
1415 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, 3, 4, 2, 1,
1416 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1417 }
qs8_gemm_4x4c2__avx_ld64(benchmark::State & state,const char * net)1418 static void qs8_gemm_4x4c2__avx_ld64(benchmark::State& state, const char* net) {
1419 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, 4, 4, 2, 1,
1420 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1421 }
1422
qs8_gemm_2x4c2__avx_ld128(benchmark::State & state,const char * net)1423 static void qs8_gemm_2x4c2__avx_ld128(benchmark::State& state, const char* net) {
1424 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, 2, 4, 2, 1,
1425 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1426 }
qs8_gemm_3x4c2__avx_ld128(benchmark::State & state,const char * net)1427 static void qs8_gemm_3x4c2__avx_ld128(benchmark::State& state, const char* net) {
1428 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, 3, 4, 2, 1,
1429 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1430 }
qs8_gemm_4x4c2__avx_ld128(benchmark::State & state,const char * net)1431 static void qs8_gemm_4x4c2__avx_ld128(benchmark::State& state, const char* net) {
1432 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, 4, 4, 2, 1,
1433 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1434 }
1435
qs8_gemm_xw_2x4c2__avx(benchmark::State & state,const char * net)1436 static void qs8_gemm_xw_2x4c2__avx(benchmark::State& state, const char* net) {
1437 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, 2, 4, 2, 1,
1438 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1439 }
qs8_gemm_xw_3x4c2__avx(benchmark::State & state,const char * net)1440 static void qs8_gemm_xw_3x4c2__avx(benchmark::State& state, const char* net) {
1441 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, 3, 4, 2, 1,
1442 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1443 }
qs8_gemm_xw_4x4c2__avx(benchmark::State & state,const char * net)1444 static void qs8_gemm_xw_4x4c2__avx(benchmark::State& state, const char* net) {
1445 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, 4, 4, 2, 1,
1446 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1447 }
1448
qs8_gemm_2x4c8__avx_ld64(benchmark::State & state,const char * net)1449 static void qs8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
1450 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, 2, 4, 8, 1,
1451 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1452 }
qs8_gemm_3x4c8__avx_ld64(benchmark::State & state,const char * net)1453 static void qs8_gemm_3x4c8__avx_ld64(benchmark::State& state, const char* net) {
1454 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, 3, 4, 8, 1,
1455 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1456 }
1457
qs8_gemm_2x4c8__avx_ld128(benchmark::State & state,const char * net)1458 static void qs8_gemm_2x4c8__avx_ld128(benchmark::State& state, const char* net) {
1459 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, 2, 4, 8, 1,
1460 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1461 }
qs8_gemm_3x4c8__avx_ld128(benchmark::State & state,const char * net)1462 static void qs8_gemm_3x4c8__avx_ld128(benchmark::State& state, const char* net) {
1463 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, 3, 4, 8, 1,
1464 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1465 }
1466
qs8_gemm_xw_2x4c8__avx(benchmark::State & state,const char * net)1467 static void qs8_gemm_xw_2x4c8__avx(benchmark::State& state, const char* net) {
1468 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, 2, 4, 8, 1,
1469 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1470 }
qs8_gemm_xw_3x4c8__avx(benchmark::State & state,const char * net)1471 static void qs8_gemm_xw_3x4c8__avx(benchmark::State& state, const char* net) {
1472 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, 3, 4, 8, 1,
1473 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1474 }
1475
qs8_gemm_2x4c2__sse41_ld64(benchmark::State & state,const char * net)1476 static void qs8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
1477 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, 2, 4, 2, 1,
1478 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1479 }
qs8_gemm_3x4c2__sse41_ld64(benchmark::State & state,const char * net)1480 static void qs8_gemm_3x4c2__sse41_ld64(benchmark::State& state, const char* net) {
1481 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, 3, 4, 2, 1,
1482 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1483 }
qs8_gemm_4x4c2__sse41_ld64(benchmark::State & state,const char * net)1484 static void qs8_gemm_4x4c2__sse41_ld64(benchmark::State& state, const char* net) {
1485 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, 4, 4, 2, 1,
1486 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1487 }
1488
qs8_gemm_2x4c2__sse41_ld128(benchmark::State & state,const char * net)1489 static void qs8_gemm_2x4c2__sse41_ld128(benchmark::State& state, const char* net) {
1490 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, 2, 4, 2, 1,
1491 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1492 }
qs8_gemm_3x4c2__sse41_ld128(benchmark::State & state,const char * net)1493 static void qs8_gemm_3x4c2__sse41_ld128(benchmark::State& state, const char* net) {
1494 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, 3, 4, 2, 1,
1495 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1496 }
qs8_gemm_4x4c2__sse41_ld128(benchmark::State & state,const char * net)1497 static void qs8_gemm_4x4c2__sse41_ld128(benchmark::State& state, const char* net) {
1498 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, 4, 4, 2, 1,
1499 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1500 }
1501
qs8_gemm_xw_2x4c2__sse41(benchmark::State & state,const char * net)1502 static void qs8_gemm_xw_2x4c2__sse41(benchmark::State& state, const char* net) {
1503 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, 2, 4, 2, 1,
1504 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1505 }
qs8_gemm_xw_3x4c2__sse41(benchmark::State & state,const char * net)1506 static void qs8_gemm_xw_3x4c2__sse41(benchmark::State& state, const char* net) {
1507 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, 3, 4, 2, 1,
1508 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1509 }
qs8_gemm_xw_4x4c2__sse41(benchmark::State & state,const char * net)1510 static void qs8_gemm_xw_4x4c2__sse41(benchmark::State& state, const char* net) {
1511 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, 4, 4, 2, 1,
1512 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1513 }
1514
qs8_gemm_2x4c8__sse41_ld64(benchmark::State & state,const char * net)1515 static void qs8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
1516 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, 2, 4, 8, 1,
1517 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1518 }
qs8_gemm_3x4c8__sse41_ld64(benchmark::State & state,const char * net)1519 static void qs8_gemm_3x4c8__sse41_ld64(benchmark::State& state, const char* net) {
1520 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, 3, 4, 8, 1,
1521 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1522 }
1523
qs8_gemm_2x4c8__sse41_ld128(benchmark::State & state,const char * net)1524 static void qs8_gemm_2x4c8__sse41_ld128(benchmark::State& state, const char* net) {
1525 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, 2, 4, 8, 1,
1526 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1527 }
qs8_gemm_3x4c8__sse41_ld128(benchmark::State & state,const char * net)1528 static void qs8_gemm_3x4c8__sse41_ld128(benchmark::State& state, const char* net) {
1529 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, 3, 4, 8, 1,
1530 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1531 }
1532
qs8_gemm_xw_2x4c8__sse41(benchmark::State & state,const char * net)1533 static void qs8_gemm_xw_2x4c8__sse41(benchmark::State& state, const char* net) {
1534 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, 2, 4, 8, 1,
1535 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1536 }
qs8_gemm_xw_3x4c8__sse41(benchmark::State & state,const char * net)1537 static void qs8_gemm_xw_3x4c8__sse41(benchmark::State& state, const char* net) {
1538 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, 3, 4, 8, 1,
1539 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1540 }
1541
qs8_gemm_2x4c8__ssse3_ld64(benchmark::State & state,const char * net)1542 static void qs8_gemm_2x4c8__ssse3_ld64(benchmark::State& state, const char* net) {
1543 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, 2, 4, 8, 1,
1544 xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1545 }
qs8_gemm_3x4c8__ssse3_ld64(benchmark::State & state,const char * net)1546 static void qs8_gemm_3x4c8__ssse3_ld64(benchmark::State& state, const char* net) {
1547 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, 3, 4, 8, 1,
1548 xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1549 }
1550
qs8_gemm_2x4c8__ssse3_ld128(benchmark::State & state,const char * net)1551 static void qs8_gemm_2x4c8__ssse3_ld128(benchmark::State& state, const char* net) {
1552 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, 2, 4, 8, 1,
1553 xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1554 }
qs8_gemm_3x4c8__ssse3_ld128(benchmark::State & state,const char * net)1555 static void qs8_gemm_3x4c8__ssse3_ld128(benchmark::State& state, const char* net) {
1556 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, 3, 4, 8, 1,
1557 xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1558 }
1559
qs8_gemm_xw_2x4c8__ssse3(benchmark::State & state,const char * net)1560 static void qs8_gemm_xw_2x4c8__ssse3(benchmark::State& state, const char* net) {
1561 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, 2, 4, 8, 1,
1562 xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3, true);
1563 }
qs8_gemm_xw_3x4c8__ssse3(benchmark::State & state,const char * net)1564 static void qs8_gemm_xw_3x4c8__ssse3(benchmark::State& state, const char* net) {
1565 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, 3, 4, 8, 1,
1566 xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3, true);
1567 }
1568
qs8_gemm_2x4c2__sse2_ld64(benchmark::State & state,const char * net)1569 static void qs8_gemm_2x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1570 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, 2, 4, 2, 1,
1571 xnn_init_qs8_conv_minmax_fp32_sse2_params);
1572 }
qs8_gemm_3x4c2__sse2_ld64(benchmark::State & state,const char * net)1573 static void qs8_gemm_3x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1574 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, 3, 4, 2, 1,
1575 xnn_init_qs8_conv_minmax_fp32_sse2_params);
1576 }
qs8_gemm_4x4c2__sse2_ld64(benchmark::State & state,const char * net)1577 static void qs8_gemm_4x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1578 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, 4, 4, 2, 1,
1579 xnn_init_qs8_conv_minmax_fp32_sse2_params);
1580 }
1581
qs8_gemm_2x4c2__sse2_ld128(benchmark::State & state,const char * net)1582 static void qs8_gemm_2x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1583 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, 2, 4, 2, 1,
1584 xnn_init_qs8_conv_minmax_fp32_sse2_params);
1585 }
qs8_gemm_3x4c2__sse2_ld128(benchmark::State & state,const char * net)1586 static void qs8_gemm_3x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1587 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, 3, 4, 2, 1,
1588 xnn_init_qs8_conv_minmax_fp32_sse2_params);
1589 }
qs8_gemm_4x4c2__sse2_ld128(benchmark::State & state,const char * net)1590 static void qs8_gemm_4x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1591 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, 4, 4, 2, 1,
1592 xnn_init_qs8_conv_minmax_fp32_sse2_params);
1593 }
1594
qs8_gemm_xw_2x4c2__sse2(benchmark::State & state,const char * net)1595 static void qs8_gemm_xw_2x4c2__sse2(benchmark::State& state, const char* net) {
1596 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, 2, 4, 2, 1,
1597 xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1598 }
qs8_gemm_xw_3x4c2__sse2(benchmark::State & state,const char * net)1599 static void qs8_gemm_xw_3x4c2__sse2(benchmark::State& state, const char* net) {
1600 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, 3, 4, 2, 1,
1601 xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1602 }
qs8_gemm_xw_4x4c2__sse2(benchmark::State & state,const char * net)1603