• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <xnnpack.h>
13 
14 #include <benchmark/benchmark.h>
15 
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 #include <xnnpack/gemm.h>
20 #include <xnnpack/igemm.h>
21 #include <xnnpack/params.h>
22 #include <xnnpack/params-init.h>
23 
24 
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_qs8_gemm_minmax_ukernel_function gemm,xnn_qs8_igemm_minmax_ukernel_function igemm,xnn_qs8_gemm_minmax_ukernel_function gemm1,xnn_qs8_igemm_minmax_ukernel_function igemm1,xnn_init_qs8_conv_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)25 static void GEMMEnd2EndBenchmark(
26   benchmark::State& state,
27   models::ExecutionPlanFactory model_factory,
28   xnn_qs8_gemm_minmax_ukernel_function gemm,
29   xnn_qs8_igemm_minmax_ukernel_function igemm,
30   xnn_qs8_gemm_minmax_ukernel_function gemm1,
31   xnn_qs8_igemm_minmax_ukernel_function igemm1,
32   xnn_init_qs8_conv_minmax_params_fn init_params,
33   uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
34   benchmark::utils::IsaCheckFunction isa_check = nullptr)
35 {
36   if (isa_check && !isa_check(state)) {
37     return;
38   }
39   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
40     state.SkipWithError("failed to initialize XNNPACK");
41     return;
42   }
43 
44   // Override microkernels chosen in xnn_initialize
45   // Note: do not directly assign to xnn_params.qs8.gemm because it breaks older gcc.
46   xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
47   xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
48   xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
49   xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
50   xnn_params.qs8.gemm.init.qs8 = init_params;
51   xnn_params.qs8.gemm.mr = mr;
52   xnn_params.qs8.gemm.nr = nr;
53   xnn_params.qs8.gemm.log2_kr = log2_kr;
54   xnn_params.qs8.gemm.log2_sr = log2_sr;
55 
56   auto execution_plan = model_factory(nullptr);
57   if (execution_plan.empty()) {
58     state.SkipWithError("failed to create a model");
59     return;
60   }
61 
62   for (auto _ : state) {
63     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
64       xnn_status status = xnn_run_operator(op.get(), nullptr);
65       if (status != xnn_status_success) {
66         state.SkipWithError("failed to run a model");
67         return;
68       }
69     }
70   }
71 
72   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
73   if (cpu_frequency != 0) {
74     state.counters["cpufreq"] = cpu_frequency;
75   }
76 }
77 
78 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)79   static void qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
80     GEMMEnd2EndBenchmark(state, model,
81       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55,
82       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55,
83       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
84       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
85       xnn_init_qs8_conv_minmax_rndnu_neon_params,
86       4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
87       benchmark::utils::CheckNEONDOT);
88   }
qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State & state,models::ExecutionPlanFactory model)89   static void qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
90     GEMMEnd2EndBenchmark(state, model,
91       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64,
92       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64,
93       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
94       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
95       xnn_init_qs8_conv_minmax_rndnu_neon_params,
96       4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
97       benchmark::utils::CheckNEONDOT);
98   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)99   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
100     GEMMEnd2EndBenchmark(state, model,
101       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
102       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
103       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
104       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
105       xnn_init_qs8_conv_minmax_rndnu_neon_params,
106       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
107       benchmark::utils::CheckNEON);
108   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)109   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
110     GEMMEnd2EndBenchmark(state, model,
111       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
112       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
113       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
114       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
115       xnn_init_qs8_conv_minmax_rndnu_neon_params,
116       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
117       benchmark::utils::CheckNEON);
118   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)119   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
120     GEMMEnd2EndBenchmark(state, model,
121       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
122       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
123       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
124       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
125       xnn_init_qs8_conv_minmax_rndnu_neon_params,
126       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
127       benchmark::utils::CheckNEON);
128   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)129   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
130     GEMMEnd2EndBenchmark(state, model,
131       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
132       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
133       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
134       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
135       xnn_init_qs8_conv_minmax_rndnu_neon_params,
136       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
137       benchmark::utils::CheckNEON);
138   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)139   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
140     GEMMEnd2EndBenchmark(state, model,
141       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
142       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
143       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
144       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
145       xnn_init_qs8_conv_minmax_rndnu_neon_params,
146       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
147       benchmark::utils::CheckNEON);
148   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)149   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
150     GEMMEnd2EndBenchmark(state, model,
151       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
152       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
153       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
154       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
155       xnn_init_qs8_conv_minmax_rndnu_neon_params,
156       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
157       benchmark::utils::CheckNEON);
158   }
159   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)
BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_ld64)160   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_ld64)
161   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
162   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
163   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
164   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
165   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
166   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
167 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
168 
169 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
170   static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
171     GEMMEnd2EndBenchmark(state, model,
172       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
173       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
174       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
175       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
176       xnn_init_qs8_conv_minmax_rndnu_neon_params,
177       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
178       benchmark::utils::CheckNEONDOT);
179   }
qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State & state,models::ExecutionPlanFactory model)180   static void qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
181     GEMMEnd2EndBenchmark(state, model,
182       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32,
183       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
184       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld32,
185       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
186       xnn_init_qs8_conv_minmax_rndnu_neon_params,
187       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
188       benchmark::utils::CheckNEONDOT);
189   }
qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State & state,models::ExecutionPlanFactory model)190   static void qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
191     GEMMEnd2EndBenchmark(state, model,
192       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64,
193       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64,
194       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
195       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
196       xnn_init_qs8_conv_minmax_rndnu_neon_params,
197       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
198       benchmark::utils::CheckNEONDOT);
199   }
qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)200   static void qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
201     GEMMEnd2EndBenchmark(state, model,
202       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
203       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
204       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
205       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
206       xnn_init_qs8_conv_minmax_rndnu_neon_params,
207       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
208       benchmark::utils::CheckNEONDOT);
209   }
qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)210   static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
211     GEMMEnd2EndBenchmark(state, model,
212       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
213       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
214       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
215       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
216       xnn_init_qs8_conv_minmax_rndnu_neon_params,
217       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
218       benchmark::utils::CheckNEON);
219   }
qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)220   static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
221     GEMMEnd2EndBenchmark(state, model,
222       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
223       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
224       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
225       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
226       xnn_init_qs8_conv_minmax_rndnu_neon_params,
227       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
228       benchmark::utils::CheckNEON);
229   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)230   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
231     GEMMEnd2EndBenchmark(state, model,
232       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
233       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
234       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
235       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
236       xnn_init_qs8_conv_minmax_rndnu_neon_params,
237       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
238       benchmark::utils::CheckNEON);
239   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)240   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
241     GEMMEnd2EndBenchmark(state, model,
242       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
243       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
244       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
245       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
246       xnn_init_qs8_conv_minmax_rndnu_neon_params,
247       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
248       benchmark::utils::CheckNEON);
249   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)250   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
251     GEMMEnd2EndBenchmark(state, model,
252       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
253       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
254       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
255       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
256       xnn_init_qs8_conv_minmax_rndnu_neon_params,
257       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
258       benchmark::utils::CheckNEON);
259   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)260   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
261     GEMMEnd2EndBenchmark(state, model,
262       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
263       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
264       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
265       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
266       xnn_init_qs8_conv_minmax_rndnu_neon_params,
267       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
268       benchmark::utils::CheckNEON);
269   }
qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)270   static void qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
271     GEMMEnd2EndBenchmark(state, model,
272       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
273       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
274       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
275       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
276       xnn_init_qs8_conv_minmax_rndnu_neon_params,
277       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
278       benchmark::utils::CheckNEON);
279   }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,models::ExecutionPlanFactory model)280   static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
281     GEMMEnd2EndBenchmark(state, model,
282       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm,
283       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm,
284       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm,
285       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm,
286       xnn_init_qs8_conv_minmax_rndnu_neon_params,
287       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
288       benchmark::utils::CheckNEON);
289   }
qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)290   static void qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
291     GEMMEnd2EndBenchmark(state, model,
292       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53,
293       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
294       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
295       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
296       xnn_init_qs8_conv_minmax_rndnu_neon_params,
297       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
298       benchmark::utils::CheckNEON);
299   }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)300   static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
301     GEMMEnd2EndBenchmark(state, model,
302       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53,
303       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53,
304       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53,
305       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53,
306       xnn_init_qs8_conv_minmax_rndnu_neon_params,
307       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
308       benchmark::utils::CheckNEON);
309   }
310 
311   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld32)312   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld32)
313   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld64)
314   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld128)
315   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
316   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
317   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
318   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
319   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
320   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
321   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
322   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53)
323   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm)
324   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal)
325 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
326 
327 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
328   static void qs8_gemm_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
329     GEMMEnd2EndBenchmark(state, model,
330       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
331       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
332       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
333       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
334       xnn_init_qs8_conv_minmax_rndnu_neon_params,
335       2 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
336       benchmark::utils::CheckNEON);
337   }
qs8_gemm_2x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)338   static void qs8_gemm_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
339     GEMMEnd2EndBenchmark(state, model,
340       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
341       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
342       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
343       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
344       xnn_init_qs8_conv_minmax_rndnu_neon_params,
345       2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
346       benchmark::utils::CheckNEON);
347   }
qs8_gemm_3x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)348   static void qs8_gemm_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
349     GEMMEnd2EndBenchmark(state, model,
350       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
351       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
352       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
353       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
354       xnn_init_qs8_conv_minmax_rndnu_neon_params,
355       3 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
356       benchmark::utils::CheckNEON);
357   }
qs8_gemm_3x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)358   static void qs8_gemm_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
359     GEMMEnd2EndBenchmark(state, model,
360       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
361       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
362       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
363       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
364       xnn_init_qs8_conv_minmax_rndnu_neon_params,
365       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
366       benchmark::utils::CheckNEON);
367   }
qs8_gemm_4x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)368   static void qs8_gemm_4x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
369     GEMMEnd2EndBenchmark(state, model,
370       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
371       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
372       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
373       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
374       xnn_init_qs8_conv_minmax_rndnu_neon_params,
375       4 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
376       benchmark::utils::CheckNEON);
377   }
qs8_gemm_4x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)378   static void qs8_gemm_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
379     GEMMEnd2EndBenchmark(state, model,
380       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
381       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
382       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
383       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
384       xnn_init_qs8_conv_minmax_rndnu_neon_params,
385       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
386       benchmark::utils::CheckNEON);
387   }
qs8_gemm_6x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)388   static void qs8_gemm_6x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
389     GEMMEnd2EndBenchmark(state, model,
390       xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
391       xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
392       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
393       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
394       xnn_init_qs8_conv_minmax_rndnu_neon_params,
395       6 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
396       benchmark::utils::CheckNEON);
397   }
qs8_gemm_6x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)398   static void qs8_gemm_6x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
399     GEMMEnd2EndBenchmark(state, model,
400       xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
401       xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
402       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
403       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
404       xnn_init_qs8_conv_minmax_rndnu_neon_params,
405       6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
406       benchmark::utils::CheckNEON);
407   }
qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)408   static void qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
409     GEMMEnd2EndBenchmark(state, model,
410       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm,
411       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm,
412       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
413       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
414       xnn_init_qs8_conv_minmax_rndnu_neon_params,
415       2 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
416       benchmark::utils::CheckNEON);
417   }
qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)418   static void qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
419     GEMMEnd2EndBenchmark(state, model,
420       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm,
421       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm,
422       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
423       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
424       xnn_init_qs8_conv_minmax_rndnu_neon_params,
425       2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
426       benchmark::utils::CheckNEON);
427   }
qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)428   static void qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
429     GEMMEnd2EndBenchmark(state, model,
430       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm,
431       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm,
432       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
433       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
434       xnn_init_qs8_conv_minmax_rndnu_neon_params,
435       3 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
436       benchmark::utils::CheckNEON);
437   }
qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)438   static void qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
439     GEMMEnd2EndBenchmark(state, model,
440       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm,
441       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm,
442       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
443       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
444       xnn_init_qs8_conv_minmax_rndnu_neon_params,
445       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
446       benchmark::utils::CheckNEON);
447   }
qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)448   static void qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
449     GEMMEnd2EndBenchmark(state, model,
450       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm,
451       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm,
452       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
453       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
454       xnn_init_qs8_conv_minmax_rndnu_neon_params,
455       4 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
456       benchmark::utils::CheckNEON);
457   }
qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)458   static void qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
459     GEMMEnd2EndBenchmark(state, model,
460       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm,
461       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm,
462       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
463       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
464       xnn_init_qs8_conv_minmax_rndnu_neon_params,
465       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
466       benchmark::utils::CheckNEON);
467   }
qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)468   static void qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
469     GEMMEnd2EndBenchmark(state, model,
470       xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm,
471       xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm,
472       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
473       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
474       xnn_init_qs8_conv_minmax_rndnu_neon_params,
475       6 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
476       benchmark::utils::CheckNEON);
477   }
qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)478   static void qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
479     GEMMEnd2EndBenchmark(state, model,
480       xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm,
481       xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm,
482       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
483       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
484       xnn_init_qs8_conv_minmax_rndnu_neon_params,
485       6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
486       benchmark::utils::CheckNEON);
487   }
qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)488   static void qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
489     GEMMEnd2EndBenchmark(state, model,
490       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup,
491       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup,
492       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
493       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
494       xnn_init_qs8_conv_minmax_rndnu_neon_params,
495       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
496       benchmark::utils::CheckNEON);
497   }
qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)498   static void qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
499     GEMMEnd2EndBenchmark(state, model,
500       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup,
501       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup,
502       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
503       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
504       xnn_init_qs8_conv_minmax_rndnu_neon_params,
505       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
506       benchmark::utils::CheckNEON);
507   }
qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)508   static void qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
509     GEMMEnd2EndBenchmark(state, model,
510       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup,
511       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup,
512       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
513       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
514       xnn_init_qs8_conv_minmax_rndnu_neon_params,
515       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
516       benchmark::utils::CheckNEON);
517   }
qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)518   static void qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
519     GEMMEnd2EndBenchmark(state, model,
520       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup,
521       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup,
522       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
523       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
524       xnn_init_qs8_conv_minmax_rndnu_neon_params,
525       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
526       benchmark::utils::CheckNEON);
527   }
qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)528   static void qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
529     GEMMEnd2EndBenchmark(state, model,
530       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup,
531       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup,
532       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
533       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
534       xnn_init_qs8_conv_minmax_rndnu_neon_params,
535       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
536       benchmark::utils::CheckNEON);
537   }
qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)538   static void qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
539     GEMMEnd2EndBenchmark(state, model,
540       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup,
541       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup,
542       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
543       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
544       xnn_init_qs8_conv_minmax_rndnu_neon_params,
545       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
546       benchmark::utils::CheckNEON);
547   }
qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)548   static void qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
549     GEMMEnd2EndBenchmark(state, model,
550       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r,
551       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r,
552       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
553       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
554       xnn_init_qs8_conv_minmax_rndnu_neon_params,
555       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
556       benchmark::utils::CheckNEON);
557   }
qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)558   static void qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
559     GEMMEnd2EndBenchmark(state, model,
560       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r,
561       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r,
562       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
563       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
564       xnn_init_qs8_conv_minmax_rndnu_neon_params,
565       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
566       benchmark::utils::CheckNEON);
567   }
qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)568   static void qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
569     GEMMEnd2EndBenchmark(state, model,
570       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r,
571       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r,
572       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
573       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
574       xnn_init_qs8_conv_minmax_rndnu_neon_params,
575       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
576       benchmark::utils::CheckNEON);
577   }
qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)578   static void qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
579     GEMMEnd2EndBenchmark(state, model,
580       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r,
581       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r,
582       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
583       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
584       xnn_init_qs8_conv_minmax_rndnu_neon_params,
585       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
586       benchmark::utils::CheckNEON);
587   }
qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)588   static void qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
589     GEMMEnd2EndBenchmark(state, model,
590       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r,
591       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r,
592       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
593       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
594       xnn_init_qs8_conv_minmax_rndnu_neon_params,
595       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
596       benchmark::utils::CheckNEON);
597   }
qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)598   static void qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
599     GEMMEnd2EndBenchmark(state, model,
600       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r,
601       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r,
602       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
603       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
604       xnn_init_qs8_conv_minmax_rndnu_neon_params,
605       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
606       benchmark::utils::CheckNEON);
607   }
qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)608   static void qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
609     GEMMEnd2EndBenchmark(state, model,
610       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r,
611       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r,
612       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
613       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
614       xnn_init_qs8_conv_minmax_rndnu_neon_params,
615       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
616       benchmark::utils::CheckNEON);
617   }
qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)618   static void qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
619     GEMMEnd2EndBenchmark(state, model,
620       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r,
621       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r,
622       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
623       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
624       xnn_init_qs8_conv_minmax_rndnu_neon_params,
625       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
626       benchmark::utils::CheckNEON);
627   }
qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)628   static void qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
629     GEMMEnd2EndBenchmark(state, model,
630       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r,
631       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r,
632       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
633       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
634       xnn_init_qs8_conv_minmax_rndnu_neon_params,
635       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
636       benchmark::utils::CheckNEON);
637   }
qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)638   static void qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
639     GEMMEnd2EndBenchmark(state, model,
640       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r,
641       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r,
642       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
643       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
644       xnn_init_qs8_conv_minmax_rndnu_neon_params,
645       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
646       benchmark::utils::CheckNEON);
647   }
qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)648   static void qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
649     GEMMEnd2EndBenchmark(state, model,
650       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r,
651       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r,
652       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
653       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
654       xnn_init_qs8_conv_minmax_rndnu_neon_params,
655       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
656       benchmark::utils::CheckNEON);
657   }
qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)658   static void qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
659     GEMMEnd2EndBenchmark(state, model,
660       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r,
661       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r,
662       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
663       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
664       xnn_init_qs8_conv_minmax_rndnu_neon_params,
665       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
666       benchmark::utils::CheckNEON);
667   }
qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)668   static void qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
669     GEMMEnd2EndBenchmark(state, model,
670       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r,
671       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r,
672       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
673       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
674       xnn_init_qs8_conv_minmax_rndnu_neon_params,
675       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
676       benchmark::utils::CheckNEON);
677   }
qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)678   static void qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
679     GEMMEnd2EndBenchmark(state, model,
680       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r,
681       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r,
682       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
683       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
684       xnn_init_qs8_conv_minmax_rndnu_neon_params,
685       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
686       benchmark::utils::CheckNEON);
687   }
qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)688   static void qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
689     GEMMEnd2EndBenchmark(state, model,
690       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r,
691       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r,
692       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
693       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
694       xnn_init_qs8_conv_minmax_rndnu_neon_params,
695       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
696       benchmark::utils::CheckNEON);
697   }
qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)698   static void qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
699     GEMMEnd2EndBenchmark(state, model,
700       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r,
701       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r,
702       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
703       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
704       xnn_init_qs8_conv_minmax_rndnu_neon_params,
705       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
706       benchmark::utils::CheckNEON);
707   }
qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)708   static void qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
709     GEMMEnd2EndBenchmark(state, model,
710       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r,
711       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r,
712       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
713       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
714       xnn_init_qs8_conv_minmax_rndnu_neon_params,
715       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
716       benchmark::utils::CheckNEON);
717   }
qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)718   static void qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
719     GEMMEnd2EndBenchmark(state, model,
720       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r,
721       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r,
722       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
723       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
724       xnn_init_qs8_conv_minmax_rndnu_neon_params,
725       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
726       benchmark::utils::CheckNEON);
727   }
qs8_gemm_2x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)728   static void qs8_gemm_2x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
729     GEMMEnd2EndBenchmark(state, model,
730       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal,
731       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal,
732       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
733       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
734       xnn_init_qs8_conv_minmax_rndnu_neon_params,
735       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
736       benchmark::utils::CheckNEON);
737   }
qs8_gemm_2x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)738   static void qs8_gemm_2x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
739     GEMMEnd2EndBenchmark(state, model,
740       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal,
741       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal,
742       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
743       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
744       xnn_init_qs8_conv_minmax_rndnu_neon_params,
745       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
746       benchmark::utils::CheckNEON);
747   }
qs8_gemm_3x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)748   static void qs8_gemm_3x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
749     GEMMEnd2EndBenchmark(state, model,
750       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal,
751       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal,
752       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
753       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
754       xnn_init_qs8_conv_minmax_rndnu_neon_params,
755       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
756       benchmark::utils::CheckNEON);
757   }
qs8_gemm_3x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)758   static void qs8_gemm_3x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
759     GEMMEnd2EndBenchmark(state, model,
760       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal,
761       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal,
762       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
763       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
764       xnn_init_qs8_conv_minmax_rndnu_neon_params,
765       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
766       benchmark::utils::CheckNEON);
767   }
qs8_gemm_4x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)768   static void qs8_gemm_4x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
769     GEMMEnd2EndBenchmark(state, model,
770       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal,
771       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal,
772       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
773       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
774       xnn_init_qs8_conv_minmax_rndnu_neon_params,
775       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
776       benchmark::utils::CheckNEON);
777   }
qs8_gemm_4x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)778   static void qs8_gemm_4x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
779     GEMMEnd2EndBenchmark(state, model,
780       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal,
781       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal,
782       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
783       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
784       xnn_init_qs8_conv_minmax_rndnu_neon_params,
785       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
786       benchmark::utils::CheckNEON);
787   }
qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)788   static void qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
789     GEMMEnd2EndBenchmark(state, model,
790       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup,
791       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup,
792       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
793       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
794       xnn_init_qs8_conv_minmax_rndnu_neon_params,
795       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
796       benchmark::utils::CheckNEON);
797   }
qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)798   static void qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
799     GEMMEnd2EndBenchmark(state, model,
800       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup,
801       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup,
802       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
803       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
804       xnn_init_qs8_conv_minmax_rndnu_neon_params,
805       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
806       benchmark::utils::CheckNEON);
807   }
qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)808   static void qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
809     GEMMEnd2EndBenchmark(state, model,
810       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup,
811       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup,
812       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
813       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
814       xnn_init_qs8_conv_minmax_rndnu_neon_params,
815       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
816       benchmark::utils::CheckNEON);
817   }
qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)818   static void qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
819     GEMMEnd2EndBenchmark(state, model,
820       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup,
821       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup,
822       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
823       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
824       xnn_init_qs8_conv_minmax_rndnu_neon_params,
825       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
826       benchmark::utils::CheckNEON);
827   }
qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)828   static void qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
829     GEMMEnd2EndBenchmark(state, model,
830       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup,
831       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup,
832       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
833       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
834       xnn_init_qs8_conv_minmax_rndnu_neon_params,
835       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
836       benchmark::utils::CheckNEON);
837   }
qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)838   static void qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
839     GEMMEnd2EndBenchmark(state, model,
840       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup,
841       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup,
842       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
843       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
844       xnn_init_qs8_conv_minmax_rndnu_neon_params,
845       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
846       benchmark::utils::CheckNEON);
847   }
qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)848   static void qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
849     GEMMEnd2EndBenchmark(state, model,
850       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r,
851       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r,
852       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
853       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
854       xnn_init_qs8_conv_minmax_rndnu_neon_params,
855       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
856       benchmark::utils::CheckNEON);
857   }
qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)858   static void qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
859     GEMMEnd2EndBenchmark(state, model,
860       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r,
861       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r,
862       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
863       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
864       xnn_init_qs8_conv_minmax_rndnu_neon_params,
865       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
866       benchmark::utils::CheckNEON);
867   }
qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)868   static void qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
869     GEMMEnd2EndBenchmark(state, model,
870       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r,
871       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r,
872       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
873       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
874       xnn_init_qs8_conv_minmax_rndnu_neon_params,
875       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
876       benchmark::utils::CheckNEON);
877   }
qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)878   static void qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
879     GEMMEnd2EndBenchmark(state, model,
880       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r,
881       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r,
882       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
883       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
884       xnn_init_qs8_conv_minmax_rndnu_neon_params,
885       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
886       benchmark::utils::CheckNEON);
887   }
qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)888   static void qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
889     GEMMEnd2EndBenchmark(state, model,
890       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r,
891       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r,
892       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
893       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
894       xnn_init_qs8_conv_minmax_rndnu_neon_params,
895       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
896       benchmark::utils::CheckNEON);
897   }
qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)898   static void qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
899     GEMMEnd2EndBenchmark(state, model,
900       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r,
901       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r,
902       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
903       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
904       xnn_init_qs8_conv_minmax_rndnu_neon_params,
905       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
906       benchmark::utils::CheckNEON);
907   }
qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)908   static void qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
909     GEMMEnd2EndBenchmark(state, model,
910       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r,
911       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r,
912       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
913       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
914       xnn_init_qs8_conv_minmax_rndnu_neon_params,
915       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
916       benchmark::utils::CheckNEON);
917   }
qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)918   static void qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
919     GEMMEnd2EndBenchmark(state, model,
920       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r,
921       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r,
922       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
923       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
924       xnn_init_qs8_conv_minmax_rndnu_neon_params,
925       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
926       benchmark::utils::CheckNEON);
927   }
qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)928   static void qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
929     GEMMEnd2EndBenchmark(state, model,
930       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r,
931       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r,
932       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
933       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
934       xnn_init_qs8_conv_minmax_rndnu_neon_params,
935       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
936       benchmark::utils::CheckNEON);
937   }
qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)938   static void qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
939     GEMMEnd2EndBenchmark(state, model,
940       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r,
941       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r,
942       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
943       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
944       xnn_init_qs8_conv_minmax_rndnu_neon_params,
945       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
946       benchmark::utils::CheckNEON);
947   }
qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)948   static void qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
949     GEMMEnd2EndBenchmark(state, model,
950       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r,
951       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r,
952       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
953       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
954       xnn_init_qs8_conv_minmax_rndnu_neon_params,
955       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
956       benchmark::utils::CheckNEON);
957   }
qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)958   static void qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
959     GEMMEnd2EndBenchmark(state, model,
960       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r,
961       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r,
962       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
963       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
964       xnn_init_qs8_conv_minmax_rndnu_neon_params,
965       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
966       benchmark::utils::CheckNEON);
967   }
qs8_gemm_2x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)968   static void qs8_gemm_2x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
969     GEMMEnd2EndBenchmark(state, model,
970       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal,
971       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal,
972       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
973       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
974       xnn_init_qs8_conv_minmax_rndnu_neon_params,
975       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
976       benchmark::utils::CheckNEON);
977   }
qs8_gemm_2x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)978   static void qs8_gemm_2x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
979     GEMMEnd2EndBenchmark(state, model,
980       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal,
981       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal,
982       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
983       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
984       xnn_init_qs8_conv_minmax_rndnu_neon_params,
985       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
986       benchmark::utils::CheckNEON);
987   }
qs8_gemm_3x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)988   static void qs8_gemm_3x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
989     GEMMEnd2EndBenchmark(state, model,
990       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal,
991       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal,
992       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
993       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
994       xnn_init_qs8_conv_minmax_rndnu_neon_params,
995       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
996       benchmark::utils::CheckNEON);
997   }
qs8_gemm_3x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)998   static void qs8_gemm_3x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
999     GEMMEnd2EndBenchmark(state, model,
1000       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal,
1001       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal,
1002       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1003       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1004       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1005       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1006       benchmark::utils::CheckNEON);
1007   }
qs8_gemm_4x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1008   static void qs8_gemm_4x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1009     GEMMEnd2EndBenchmark(state, model,
1010       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal,
1011       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal,
1012       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1013       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1014       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1015       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1016       benchmark::utils::CheckNEON);
1017   }
qs8_gemm_4x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1018   static void qs8_gemm_4x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1019     GEMMEnd2EndBenchmark(state, model,
1020       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal,
1021       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal,
1022       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1023       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1024       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1025       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1026       benchmark::utils::CheckNEON);
1027   }
qs8_gemm_2x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1028   static void qs8_gemm_2x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1029     GEMMEnd2EndBenchmark(state, model,
1030       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup,
1031       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup,
1032       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1033       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1034       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1035       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1036       benchmark::utils::CheckNEON);
1037   }
qs8_gemm_2x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1038   static void qs8_gemm_2x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1039     GEMMEnd2EndBenchmark(state, model,
1040       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup,
1041       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup,
1042       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1043       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1044       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1045       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1046       benchmark::utils::CheckNEON);
1047   }
qs8_gemm_3x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1048   static void qs8_gemm_3x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1049     GEMMEnd2EndBenchmark(state, model,
1050       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup,
1051       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup,
1052       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1053       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1054       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1055       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1056       benchmark::utils::CheckNEON);
1057   }
qs8_gemm_3x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1058   static void qs8_gemm_3x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1059     GEMMEnd2EndBenchmark(state, model,
1060       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup,
1061       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup,
1062       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1063       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1064       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1065       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1066       benchmark::utils::CheckNEON);
1067   }
qs8_gemm_4x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1068   static void qs8_gemm_4x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1069     GEMMEnd2EndBenchmark(state, model,
1070       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup,
1071       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup,
1072       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1073       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1074       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1075       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1076       benchmark::utils::CheckNEON);
1077   }
qs8_gemm_4x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1078   static void qs8_gemm_4x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1079     GEMMEnd2EndBenchmark(state, model,
1080       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup,
1081       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup,
1082       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1083       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1084       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1085       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1086       benchmark::utils::CheckNEON);
1087   }
qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1088   static void qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1089     GEMMEnd2EndBenchmark(state, model,
1090       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r,
1091       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r,
1092       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1093       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1094       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1095       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1096       benchmark::utils::CheckNEON);
1097   }
qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1098   static void qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1099     GEMMEnd2EndBenchmark(state, model,
1100       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r,
1101       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r,
1102       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1103       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1104       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1105       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1106       benchmark::utils::CheckNEON);
1107   }
qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1108   static void qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1109     GEMMEnd2EndBenchmark(state, model,
1110       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r,
1111       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r,
1112       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1113       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1114       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1115       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1116       benchmark::utils::CheckNEON);
1117   }
qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1118   static void qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1119     GEMMEnd2EndBenchmark(state, model,
1120       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r,
1121       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r,
1122       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1123       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1124       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1125       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1126       benchmark::utils::CheckNEON);
1127   }
qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1128   static void qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1129     GEMMEnd2EndBenchmark(state, model,
1130       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r,
1131       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r,
1132       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1133       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1134       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1135       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1136       benchmark::utils::CheckNEON);
1137   }
qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1138   static void qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1139     GEMMEnd2EndBenchmark(state, model,
1140       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r,
1141       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r,
1142       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1143       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1144       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1145       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1146       benchmark::utils::CheckNEON);
1147   }
qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1148   static void qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1149     GEMMEnd2EndBenchmark(state, model,
1150       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r,
1151       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r,
1152       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1153       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1154       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1155       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1156       benchmark::utils::CheckNEON);
1157   }
qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1158   static void qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1159     GEMMEnd2EndBenchmark(state, model,
1160       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r,
1161       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r,
1162       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1163       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1164       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1165       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1166       benchmark::utils::CheckNEON);
1167   }
qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1168   static void qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1169     GEMMEnd2EndBenchmark(state, model,
1170       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r,
1171       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r,
1172       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1173       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1174       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1175       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1176       benchmark::utils::CheckNEON);
1177   }
qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1178   static void qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1179     GEMMEnd2EndBenchmark(state, model,
1180       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r,
1181       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r,
1182       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1183       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1184       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1185       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1186       benchmark::utils::CheckNEON);
1187   }
qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1188   static void qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1189     GEMMEnd2EndBenchmark(state, model,
1190       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r,
1191       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r,
1192       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1193       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1194       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1195       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1196       benchmark::utils::CheckNEON);
1197   }
qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1198   static void qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1199     GEMMEnd2EndBenchmark(state, model,
1200       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r,
1201       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r,
1202       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1203       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1204       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1205       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1206       benchmark::utils::CheckNEON);
1207   }
qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1208   static void qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1209     GEMMEnd2EndBenchmark(state, model,
1210       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r,
1211       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r,
1212       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1213       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1214       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1215       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1216       benchmark::utils::CheckNEON);
1217   }
qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1218   static void qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1219     GEMMEnd2EndBenchmark(state, model,
1220       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r,
1221       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r,
1222       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1223       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1224       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1225       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1226       benchmark::utils::CheckNEON);
1227   }
qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1228   static void qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1229     GEMMEnd2EndBenchmark(state, model,
1230       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r,
1231       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r,
1232       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1233       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1234       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1235       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1236       benchmark::utils::CheckNEON);
1237   }
qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1238   static void qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1239     GEMMEnd2EndBenchmark(state, model,
1240       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r,
1241       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r,
1242       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1243       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1244       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1245       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1246       benchmark::utils::CheckNEON);
1247   }
qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1248   static void qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1249     GEMMEnd2EndBenchmark(state, model,
1250       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r,
1251       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r,
1252       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1253       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1254       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1255       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1256       benchmark::utils::CheckNEON);
1257   }
qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1258   static void qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1259     GEMMEnd2EndBenchmark(state, model,
1260       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r,
1261       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r,
1262       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1263       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1264       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1265       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1266       benchmark::utils::CheckNEON);
1267   }
qs8_gemm_2x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1268   static void qs8_gemm_2x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1269     GEMMEnd2EndBenchmark(state, model,
1270       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull,
1271       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull,
1272       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1273       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1274       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1275       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1276       benchmark::utils::CheckNEON);
1277   }
qs8_gemm_2x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1278   static void qs8_gemm_2x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1279     GEMMEnd2EndBenchmark(state, model,
1280       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull,
1281       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull,
1282       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1283       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1284       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1285       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1286       benchmark::utils::CheckNEON);
1287   }
qs8_gemm_3x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1288   static void qs8_gemm_3x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1289     GEMMEnd2EndBenchmark(state, model,
1290       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull,
1291       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull,
1292       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1293       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1294       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1295       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1296       benchmark::utils::CheckNEON);
1297   }
qs8_gemm_3x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1298   static void qs8_gemm_3x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1299     GEMMEnd2EndBenchmark(state, model,
1300       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull,
1301       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull,
1302       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1303       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1304       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1305       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1306       benchmark::utils::CheckNEON);
1307   }
qs8_gemm_4x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1308   static void qs8_gemm_4x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1309     GEMMEnd2EndBenchmark(state, model,
1310       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull,
1311       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull,
1312       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1313       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1314       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1315       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1316       benchmark::utils::CheckNEON);
1317   }
qs8_gemm_4x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1318   static void qs8_gemm_4x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1319     GEMMEnd2EndBenchmark(state, model,
1320       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull,
1321       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull,
1322       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1323       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1324       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1325       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1326       benchmark::utils::CheckNEON);
1327   }
qs8_gemm_2x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1328   static void qs8_gemm_2x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1329     GEMMEnd2EndBenchmark(state, model,
1330       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup,
1331       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup,
1332       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1333       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1334       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1335       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1336       benchmark::utils::CheckNEON);
1337   }
qs8_gemm_2x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1338   static void qs8_gemm_2x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1339     GEMMEnd2EndBenchmark(state, model,
1340       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup,
1341       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup,
1342       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1343       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1344       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1345       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1346       benchmark::utils::CheckNEON);
1347   }
qs8_gemm_3x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1348   static void qs8_gemm_3x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1349     GEMMEnd2EndBenchmark(state, model,
1350       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup,
1351       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup,
1352       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1353       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1354       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1355       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1356       benchmark::utils::CheckNEON);
1357   }
qs8_gemm_3x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1358   static void qs8_gemm_3x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1359     GEMMEnd2EndBenchmark(state, model,
1360       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup,
1361       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup,
1362       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1363       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1364       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1365       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1366       benchmark::utils::CheckNEON);
1367   }
qs8_gemm_4x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1368   static void qs8_gemm_4x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1369     GEMMEnd2EndBenchmark(state, model,
1370       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup,
1371       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup,
1372       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1373       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1374       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1375       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1376       benchmark::utils::CheckNEON);
1377   }
qs8_gemm_4x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1378   static void qs8_gemm_4x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1379     GEMMEnd2EndBenchmark(state, model,
1380       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup,
1381       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup,
1382       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1383       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1384       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1385       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1386       benchmark::utils::CheckNEON);
1387   }
qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1388   static void qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1389     GEMMEnd2EndBenchmark(state, model,
1390       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r,
1391       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r,
1392       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1393       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1394       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1395       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1396       benchmark::utils::CheckNEON);
1397   }
qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1398   static void qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1399     GEMMEnd2EndBenchmark(state, model,
1400       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r,
1401       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r,
1402       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1403       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1404       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1405       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1406       benchmark::utils::CheckNEON);
1407   }
qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1408   static void qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1409     GEMMEnd2EndBenchmark(state, model,
1410       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r,
1411       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r,
1412       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1413       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1414       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1415       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1416       benchmark::utils::CheckNEON);
1417   }
qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1418   static void qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1419     GEMMEnd2EndBenchmark(state, model,
1420       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r,
1421       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r,
1422       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1423       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1424       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1425       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1426       benchmark::utils::CheckNEON);
1427   }
qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1428   static void qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1429     GEMMEnd2EndBenchmark(state, model,
1430       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r,
1431       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r,
1432       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1433       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1434       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1435       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1436       benchmark::utils::CheckNEON);
1437   }
qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1438   static void qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1439     GEMMEnd2EndBenchmark(state, model,
1440       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r,
1441       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r,
1442       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1443       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1444       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1445       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1446       benchmark::utils::CheckNEON);
1447   }
qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1448   static void qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1449     GEMMEnd2EndBenchmark(state, model,
1450       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r,
1451       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r,
1452       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1453       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1454       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1455       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1456       benchmark::utils::CheckNEON);
1457   }
qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1458   static void qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1459     GEMMEnd2EndBenchmark(state, model,
1460       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r,
1461       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r,
1462       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1463       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1464       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1465       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1466       benchmark::utils::CheckNEON);
1467   }
qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1468   static void qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1469     GEMMEnd2EndBenchmark(state, model,
1470       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r,
1471       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r,
1472       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1473       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1474       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1475       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1476       benchmark::utils::CheckNEON);
1477   }
qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1478   static void qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1479     GEMMEnd2EndBenchmark(state, model,
1480       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r,
1481       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r,
1482       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1483       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1484       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1485       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1486       benchmark::utils::CheckNEON);
1487   }
qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1488   static void qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1489     GEMMEnd2EndBenchmark(state, model,
1490       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r,
1491       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r,
1492       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1493       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1494       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1495       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1496       benchmark::utils::CheckNEON);
1497   }
qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1498   static void qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1499     GEMMEnd2EndBenchmark(state, model,
1500       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r,
1501       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r,
1502       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1503       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1504       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1505       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1506       benchmark::utils::CheckNEON);
1507   }
qs8_gemm_2x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1508   static void qs8_gemm_2x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1509     GEMMEnd2EndBenchmark(state, model,
1510       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull,
1511       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull,
1512       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1513       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1514       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1515       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1516       benchmark::utils::CheckNEON);
1517   }
qs8_gemm_2x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1518   static void qs8_gemm_2x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1519     GEMMEnd2EndBenchmark(state, model,
1520       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull,
1521       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull,
1522       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1523       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1524       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1525       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1526       benchmark::utils::CheckNEON);
1527   }
qs8_gemm_3x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1528   static void qs8_gemm_3x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1529     GEMMEnd2EndBenchmark(state, model,
1530       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull,
1531       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull,
1532       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1533       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1534       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1535       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1536       benchmark::utils::CheckNEON);
1537   }
qs8_gemm_3x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1538   static void qs8_gemm_3x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1539     GEMMEnd2EndBenchmark(state, model,
1540       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull,
1541       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull,
1542       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1543       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1544       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1545       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1546       benchmark::utils::CheckNEON);
1547   }
qs8_gemm_4x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1548   static void qs8_gemm_4x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1549     GEMMEnd2EndBenchmark(state, model,
1550       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull,
1551       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull,
1552       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1553       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1554       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1555       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1556       benchmark::utils::CheckNEON);
1557   }
qs8_gemm_4x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1558   static void qs8_gemm_4x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1559     GEMMEnd2EndBenchmark(state, model,
1560       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull,
1561       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull,
1562       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1563       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1564       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1565       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1566       benchmark::utils::CheckNEON);
1567   }
qs8_gemm_4x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1568   static void qs8_gemm_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1569     GEMMEnd2EndBenchmark(state, model,
1570       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
1571       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot,
1572       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
1573       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
1574       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1575       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1576       benchmark::utils::CheckNEONDOT);
1577   }
qs8_gemm_6x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1578   static void qs8_gemm_6x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1579     GEMMEnd2EndBenchmark(state, model,
1580       xnn_qs8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
1581       xnn_qs8_igemm_minmax_rndnu_ukernel_6x8c4__neondot,
1582       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
1583       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
1584       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1585       6 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1586       benchmark::utils::CheckNEONDOT);
1587   }
qs8_gemm_8x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1588   static void qs8_gemm_8x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1589     GEMMEnd2EndBenchmark(state, model,
1590       xnn_qs8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
1591       xnn_qs8_igemm_minmax_rndnu_ukernel_8x8c4__neondot,
1592       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
1593       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
1594       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1595       8 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1596       benchmark::utils::CheckNEONDOT);
1597   }
qs8_gemm_4x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1598   static void qs8_gemm_4x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1599     GEMMEnd2EndBenchmark(state, model,
1600       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
1601       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
1602       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
1603       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
1604       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1605       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1606       benchmark::utils::CheckNEONDOT);
1607   }
qs8_gemm_6x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1608   static void qs8_gemm_6x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1609     GEMMEnd2EndBenchmark(state, model,
1610       xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
1611       xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot,
1612       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
1613       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
1614       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1615       6 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1616       benchmark::utils::CheckNEONDOT);
1617   }
qs8_gemm_8x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1618   static void qs8_gemm_8x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1619     GEMMEnd2EndBenchmark(state, model,
1620       xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
1621       xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot,
1622       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
1623       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
1624       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1625       8 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1626       benchmark::utils::CheckNEONDOT);
1627   }
qs8_gemm_2x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1628   static void qs8_gemm_2x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1629     GEMMEnd2EndBenchmark(state, model,
1630       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mull,
1631       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull,
1632       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1633       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1634       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1635       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1636       benchmark::utils::CheckNEON);
1637   }
qs8_gemm_2x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1638   static void qs8_gemm_2x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1639     GEMMEnd2EndBenchmark(state, model,
1640       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull,
1641       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mull,
1642       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1643       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1644       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1645       2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1646       benchmark::utils::CheckNEON);
1647   }
qs8_gemm_3x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1648   static void qs8_gemm_3x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1649     GEMMEnd2EndBenchmark(state, model,
1650       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mull,
1651       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull,
1652       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1653       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1654       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1655       3 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1656       benchmark::utils::CheckNEON);
1657   }
qs8_gemm_3x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1658   static void qs8_gemm_3x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1659     GEMMEnd2EndBenchmark(state, model,
1660       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull,
1661       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull,
1662       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1663       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1664       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1665       3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1666       benchmark::utils::CheckNEON);
1667   }
qs8_gemm_4x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1668   static void qs8_gemm_4x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1669     GEMMEnd2EndBenchmark(state, model,
1670       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mull,
1671       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull,
1672       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1673       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1674       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1675       4 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1676       benchmark::utils::CheckNEON);
1677   }
qs8_gemm_4x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1678   static void qs8_gemm_4x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1679     GEMMEnd2EndBenchmark(state, model,
1680       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull,
1681       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mull,
1682       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1683       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1684       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1685       4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1686       benchmark::utils::CheckNEON);
1687   }
qs8_gemm_2x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1688   static void qs8_gemm_2x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1689     GEMMEnd2EndBenchmark(state, model,
1690       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__neon_mlal,
1691       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal,
1692       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1693       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1694       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1695       2 /* mr */, 8  /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1696       benchmark::utils::CheckNEON);
1697   }
qs8_gemm_2x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1698   static void qs8_gemm_2x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1699     GEMMEnd2EndBenchmark(state, model,
1700       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal,
1701       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal,
1702       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1703       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1704       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1705       2 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1706       benchmark::utils::CheckNEON);
1707   }
qs8_gemm_3x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1708   static void qs8_gemm_3x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1709     GEMMEnd2EndBenchmark(state, model,
1710       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c16__neon_mlal,
1711       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal,
1712       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1713       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1714       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1715       4 /* mr */, 8  /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1716       benchmark::utils::CheckNEON);
1717   }
qs8_gemm_3x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1718   static void qs8_gemm_3x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1719     GEMMEnd2EndBenchmark(state, model,
1720       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal,
1721       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal,
1722       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1723       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1724       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1725       4 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1726       benchmark::utils::CheckNEON);
1727   }
qs8_gemm_4x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1728   static void qs8_gemm_4x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1729     GEMMEnd2EndBenchmark(state, model,
1730       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c16__neon_mlal,
1731       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal,
1732       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1733       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1734       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1735       4 /* mr */, 8  /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1736       benchmark::utils::CheckNEON);
1737   }
qs8_gemm_4x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1738   static void qs8_gemm_4x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1739     GEMMEnd2EndBenchmark(state, model,
1740       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal,
1741       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal,
1742       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1743       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1744       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1745       4 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1746       benchmark::utils::CheckNEON);
1747   }
qs8_gemm_2x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1748   static void qs8_gemm_2x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1749     GEMMEnd2EndBenchmark(state, model,
1750       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal,
1751       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal,
1752       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1753       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1754       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1755       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1756       benchmark::utils::CheckNEON);
1757   }
qs8_gemm_2x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1758   static void qs8_gemm_2x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1759     GEMMEnd2EndBenchmark(state, model,
1760       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal,
1761       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mlal,
1762       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1763       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1764       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1765       2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1766       benchmark::utils::CheckNEON);
1767   }
qs8_gemm_3x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1768   static void qs8_gemm_3x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1769     GEMMEnd2EndBenchmark(state, model,
1770       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal,
1771       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal,
1772       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1773       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1774       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1775       3 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1776       benchmark::utils::CheckNEON);
1777   }
qs8_gemm_3x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1778   static void qs8_gemm_3x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1779     GEMMEnd2EndBenchmark(state, model,
1780       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal,
1781       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal,
1782       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1783       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1784       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1785       3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1786       benchmark::utils::CheckNEON);
1787   }
qs8_gemm_4x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1788   static void qs8_gemm_4x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1789     GEMMEnd2EndBenchmark(state, model,
1790       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal,
1791       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal,
1792       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1793       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1794       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1795       4 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1796       benchmark::utils::CheckNEON);
1797   }
qs8_gemm_4x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1798   static void qs8_gemm_4x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1799     GEMMEnd2EndBenchmark(state, model,
1800       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal,
1801       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal,
1802       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1803       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1804       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1805       4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1806       benchmark::utils::CheckNEON);
1807   }
1808 
1809   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neondot);
1810   BENCHMARK_QS8_END2END(qs8_gemm_6x8c4__neondot);
1811   BENCHMARK_QS8_END2END(qs8_gemm_8x8c4__neondot);
1812   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neondot);
1813   BENCHMARK_QS8_END2END(qs8_gemm_6x16c4__neondot);
1814   BENCHMARK_QS8_END2END(qs8_gemm_8x16c4__neondot);
1815 
1816   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__neon_mlal);
1817   BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__neon_mlal);
1818   BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__neon_mlal);
1819   BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__neon_mlal);
1820   BENCHMARK_QS8_END2END(qs8_gemm_4x8c8__neon_mlal);
1821   BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__neon_mlal);
1822 
1823   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__neon_mull);
1824   BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__neon_mull);
1825   BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__neon_mull);
1826   BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__neon_mull);
1827   BENCHMARK_QS8_END2END(qs8_gemm_4x8c8__neon_mull);
1828   BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__neon_mull);
1829 
1830   BENCHMARK_QS8_END2END(qs8_gemm_2x8c16__neon_mlal);
1831   BENCHMARK_QS8_END2END(qs8_gemm_2x16c16__neon_mlal);
1832   BENCHMARK_QS8_END2END(qs8_gemm_3x8c16__neon_mlal);
1833   BENCHMARK_QS8_END2END(qs8_gemm_3x16c16__neon_mlal);
1834   BENCHMARK_QS8_END2END(qs8_gemm_4x8c16__neon_mlal);
1835   BENCHMARK_QS8_END2END(qs8_gemm_4x16c16__neon_mlal);
1836 
1837   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_dup);
1838   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_dup);
1839   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_dup);
1840   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_dup);
1841   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_dup);
1842   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_dup);
1843 
1844   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_dup);
1845   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_dup);
1846   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_dup);
1847   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_dup);
1848   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_dup);
1849   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_dup);
1850 
1851   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_ld1r);
1852   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_ld1r);
1853   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_ld1r);
1854   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_ld1r);
1855   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_ld1r);
1856   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_ld1r);
1857 
1858   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_ld1r);
1859   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_ld1r);
1860   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_ld1r);
1861   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_ld1r);
1862   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_ld1r);
1863   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_ld1r);
1864 
1865   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_ld2r);
1866   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_ld2r);
1867   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_ld2r);
1868   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_ld2r);
1869   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_ld2r);
1870   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_ld2r);
1871 
// Each BENCHMARK_QS8_END2END(fn) line below names one benchmark function; the macro
// and the named functions are defined outside this excerpt. Presumably the macro
// registers fn as an end-to-end benchmark for the QS8 models -- TODO confirm.
// Lines are grouped by kernel family, one family per blank-line-separated run.

// *c4__neon_mull_ld2r family, tiles 2x8 through 4x16.
1872   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_ld2r);
1873   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_ld2r);
1874   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_ld2r);
1875   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_ld2r);
1876   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_ld2r);
1877   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_ld2r);
1878 
// *c4s2__neon_mlal family, tiles 2x8 through 4x16.
1879   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4s2__neon_mlal);
1880   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4s2__neon_mlal);
1881   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4s2__neon_mlal);
1882   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4s2__neon_mlal);
1883   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4s2__neon_mlal);
1884   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4s2__neon_mlal);
1885 
// *c4s2__neon_mull family, tiles 2x8 through 4x16.
1886   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4s2__neon_mull);
1887   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4s2__neon_mull);
1888   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4s2__neon_mull);
1889   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4s2__neon_mull);
1890   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4s2__neon_mull);
1891   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4s2__neon_mull);
1892 
// *c2__neon_mlal_dup family, tiles 2x8 through 4x16.
1893   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_dup);
1894   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_dup);
1895   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_dup);
1896   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_dup);
1897   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_dup);
1898   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_dup);
1899 
// *c2__neon_mull_dup family, tiles 2x8 through 4x16.
1900   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_dup);
1901   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_dup);
1902   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_dup);
1903   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_dup);
1904   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_dup);
1905   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_dup);
1906 
// *c2__neon_mlal_ld1r family, tiles 2x8 through 4x16.
1907   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld1r);
1908   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld1r);
1909   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld1r);
1910   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld1r);
1911   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld1r);
1912   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld1r);
1913 
// *c2__neon_mull_ld1r family, tiles 2x8 through 4x16.
1914   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld1r);
1915   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld1r);
1916   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld1r);
1917   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld1r);
1918   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld1r);
1919   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld1r);
1920 
// *c2__neon_mlal_ld2r family, tiles 2x8 through 4x16.
1921   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld2r);
1922   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld2r);
1923   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld2r);
1924   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld2r);
1925   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld2r);
1926   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld2r);
1927 
// *c2__neon_mull_ld2r family, tiles 2x8 through 4x16.
1928   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld2r);
1929   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld2r);
1930   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld2r);
1931   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld2r);
1932   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld2r);
1933   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld2r);
1934 
// *c2__neon_mlal_ld4r family, tiles 2x8 through 4x16.
1935   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld4r);
1936   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld4r);
1937   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld4r);
1938   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld4r);
1939   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld4r);
1940   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld4r);
1941 
// *c2__neon_mull_ld4r family, tiles 2x8 through 4x16.
1942   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld4r);
1943   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld4r);
1944   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld4r);
1945   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld4r);
1946   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld4r);
1947   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld4r);
1948 
// *c2s4__neon_mlal family, tiles 2x8 through 4x16.
1949   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2s4__neon_mlal);
1950   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2s4__neon_mlal);
1951   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2s4__neon_mlal);
1952   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2s4__neon_mlal);
1953   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2s4__neon_mlal);
1954   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2s4__neon_mlal);
1955 
// *c2s4__neon_mull family, tiles 2x8 through 4x16.
1956   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2s4__neon_mull);
1957   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2s4__neon_mull);
1958   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2s4__neon_mull);
1959   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2s4__neon_mull);
1960   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2s4__neon_mull);
1961   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2s4__neon_mull);
1962 
// __neon_mlal_lane family, tiles 2x8 through 6x16 (this family also includes 6-row tiles).
1963   BENCHMARK_QS8_END2END(qs8_gemm_2x8__neon_mlal_lane);
1964   BENCHMARK_QS8_END2END(qs8_gemm_2x16__neon_mlal_lane);
1965   BENCHMARK_QS8_END2END(qs8_gemm_3x8__neon_mlal_lane);
1966   BENCHMARK_QS8_END2END(qs8_gemm_3x16__neon_mlal_lane);
1967   BENCHMARK_QS8_END2END(qs8_gemm_4x8__neon_mlal_lane);
1968   BENCHMARK_QS8_END2END(qs8_gemm_4x16__neon_mlal_lane);
1969   BENCHMARK_QS8_END2END(qs8_gemm_6x8__neon_mlal_lane);
1970   BENCHMARK_QS8_END2END(qs8_gemm_6x16__neon_mlal_lane);
1971 
// __neon_mlal_lane_prfm family, same tile list as the mlal_lane group (2x8 through 6x16).
1972   BENCHMARK_QS8_END2END(qs8_gemm_2x8__neon_mlal_lane_prfm);
1973   BENCHMARK_QS8_END2END(qs8_gemm_2x16__neon_mlal_lane_prfm);
1974   BENCHMARK_QS8_END2END(qs8_gemm_3x8__neon_mlal_lane_prfm);
1975   BENCHMARK_QS8_END2END(qs8_gemm_3x16__neon_mlal_lane_prfm);
1976   BENCHMARK_QS8_END2END(qs8_gemm_4x8__neon_mlal_lane_prfm);
1977   BENCHMARK_QS8_END2END(qs8_gemm_4x16__neon_mlal_lane_prfm);
1978   BENCHMARK_QS8_END2END(qs8_gemm_6x8__neon_mlal_lane_prfm);
1979   BENCHMARK_QS8_END2END(qs8_gemm_6x16__neon_mlal_lane_prfm);
1980 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1981 
1982 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qs8_gemm_2x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)1983   static void qs8_gemm_2x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
1984     GEMMEnd2EndBenchmark(state, model,
1985       xnn_qs8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
1986       xnn_qs8_igemm_minmax_fp32_ukernel_2x16c8__avx512skx,
1987       xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
1988       xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
1989       xnn_init_qs8_conv_minmax_fp32_avx512_params,
1990       2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1991       benchmark::utils::CheckAVX512F);
1992   }
qs8_gemm_3x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)1993   static void qs8_gemm_3x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
1994     GEMMEnd2EndBenchmark(state, model,
1995       xnn_qs8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
1996       xnn_qs8_igemm_minmax_fp32_ukernel_3x16c8__avx512skx,
1997       xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
1998       xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
1999       xnn_init_qs8_conv_minmax_fp32_avx512_params,
2000       3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2001       benchmark::utils::CheckAVX512F);
2002   }
qs8_gemm_4x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)2003   static void qs8_gemm_4x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
2004     GEMMEnd2EndBenchmark(state, model,
2005       xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
2006       xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx,
2007       xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2008       xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2009       xnn_init_qs8_conv_minmax_fp32_avx512_params,
2010       4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2011       benchmark::utils::CheckAVX512F);
2012   }
qs8_gemm_2x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)2013   static void qs8_gemm_2x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
2014     GEMMEnd2EndBenchmark(state, model,
2015       xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
2016       xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__avx2,
2017       xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
2018       xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
2019       xnn_init_qs8_conv_minmax_fp32_avx2_params,
2020       2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2021       benchmark::utils::CheckAVX2);
2022   }
qs8_gemm_3x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)2023   static void qs8_gemm_3x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
2024     GEMMEnd2EndBenchmark(state, model,
2025       xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
2026       xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2,
2027       xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
2028       xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
2029       xnn_init_qs8_conv_minmax_fp32_avx2_params,
2030       3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2031       benchmark::utils::CheckAVX2);
2032   }
qs8_gemm_2x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2033   static void qs8_gemm_2x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2034     GEMMEnd2EndBenchmark(state, model,
2035       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
2036       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
2037       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2038       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2039       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2040       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2041       benchmark::utils::CheckXOP);
2042   }
qs8_gemm_2x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2043   static void qs8_gemm_2x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2044     GEMMEnd2EndBenchmark(state, model,
2045       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
2046       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
2047       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2048       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2049       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2050       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2051       benchmark::utils::CheckXOP);
2052   }
qs8_gemm_3x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2053   static void qs8_gemm_3x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2054     GEMMEnd2EndBenchmark(state, model,
2055       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
2056       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
2057       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2058       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2059       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2060       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2061       benchmark::utils::CheckXOP);
2062   }
qs8_gemm_3x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2063   static void qs8_gemm_3x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2064     GEMMEnd2EndBenchmark(state, model,
2065       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
2066       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
2067       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2068       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2069       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2070       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2071       benchmark::utils::CheckXOP);
2072   }
qs8_gemm_4x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2073   static void qs8_gemm_4x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2074     GEMMEnd2EndBenchmark(state, model,
2075       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
2076       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
2077       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2078       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2079       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2080       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2081       benchmark::utils::CheckXOP);
2082   }
qs8_gemm_4x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2083   static void qs8_gemm_4x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2084     GEMMEnd2EndBenchmark(state, model,
2085       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
2086       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
2087       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2088       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2089       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2090       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2091       benchmark::utils::CheckXOP);
2092   }
qs8_gemm_2x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2093   static void qs8_gemm_2x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2094     GEMMEnd2EndBenchmark(state, model,
2095       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
2096       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
2097       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2098       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2099       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2100       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2101       benchmark::utils::CheckXOP);
2102   }
qs8_gemm_3x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2103   static void qs8_gemm_3x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2104     GEMMEnd2EndBenchmark(state, model,
2105       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
2106       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
2107       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2108       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2109       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2110       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2111       benchmark::utils::CheckXOP);
2112   }
qs8_gemm_2x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2113   static void qs8_gemm_2x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2114     GEMMEnd2EndBenchmark(state, model,
2115       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
2116       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
2117       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2118       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2119       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2120       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2121       benchmark::utils::CheckXOP);
2122   }
qs8_gemm_3x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2123   static void qs8_gemm_3x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2124     GEMMEnd2EndBenchmark(state, model,
2125       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
2126       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
2127       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2128       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2129       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2130       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2131       benchmark::utils::CheckXOP);
2132   }
qs8_gemm_2x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2133   static void qs8_gemm_2x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2134     GEMMEnd2EndBenchmark(state, model,
2135       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
2136       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
2137       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2138       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2139       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2140       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2141       benchmark::utils::CheckAVX);
2142   }
qs8_gemm_2x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2143   static void qs8_gemm_2x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2144     GEMMEnd2EndBenchmark(state, model,
2145       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
2146       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
2147       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2148       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2149       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2150       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2151       benchmark::utils::CheckAVX);
2152   }
qs8_gemm_3x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2153   static void qs8_gemm_3x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2154     GEMMEnd2EndBenchmark(state, model,
2155       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
2156       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
2157       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2158       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2159       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2160       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2161       benchmark::utils::CheckAVX);
2162   }
qs8_gemm_3x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2163   static void qs8_gemm_3x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2164     GEMMEnd2EndBenchmark(state, model,
2165       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
2166       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
2167       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2168       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2169       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2170       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2171       benchmark::utils::CheckAVX);
2172   }
qs8_gemm_4x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2173   static void qs8_gemm_4x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2174     GEMMEnd2EndBenchmark(state, model,
2175       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
2176       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
2177       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2178       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2179       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2180       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2181       benchmark::utils::CheckAVX);
2182   }
qs8_gemm_4x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2183   static void qs8_gemm_4x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2184     GEMMEnd2EndBenchmark(state, model,
2185       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
2186       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
2187       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2188       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2189       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2190       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2191       benchmark::utils::CheckAVX);
2192   }
2193 
2194 
qs8_gemm_2x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2195   static void qs8_gemm_2x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2196     GEMMEnd2EndBenchmark(state, model,
2197       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
2198       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
2199       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2200       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2201       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2202       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2203       benchmark::utils::CheckAVX);
2204   }
qs8_gemm_2x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2205   static void qs8_gemm_2x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2206     GEMMEnd2EndBenchmark(state, model,
2207       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
2208       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
2209       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2210       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2211       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2212       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2213       benchmark::utils::CheckAVX);
2214   }
qs8_gemm_3x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2215   static void qs8_gemm_3x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2216     GEMMEnd2EndBenchmark(state, model,
2217       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
2218       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
2219       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2220       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2221       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2222       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2223       benchmark::utils::CheckAVX);
2224   }
qs8_gemm_3x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2225   static void qs8_gemm_3x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2226     GEMMEnd2EndBenchmark(state, model,
2227       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
2228       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
2229       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2230       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2231       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2232       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2233       benchmark::utils::CheckAVX);
2234   }
qs8_gemm_2x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2235   static void qs8_gemm_2x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2236     GEMMEnd2EndBenchmark(state, model,
2237       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
2238       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
2239       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2240       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2241       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2242       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2243       benchmark::utils::CheckSSE41);
2244   }
qs8_gemm_2x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2245   static void qs8_gemm_2x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2246     GEMMEnd2EndBenchmark(state, model,
2247       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
2248       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
2249       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2250       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2251       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2252       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2253       benchmark::utils::CheckSSE41);
2254   }
qs8_gemm_3x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2255   static void qs8_gemm_3x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2256     GEMMEnd2EndBenchmark(state, model,
2257       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
2258       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
2259       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2260       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2261       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2262       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2263       benchmark::utils::CheckSSE41);
2264   }
qs8_gemm_3x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2265   static void qs8_gemm_3x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2266     GEMMEnd2EndBenchmark(state, model,
2267       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
2268       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
2269       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2270       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2271       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2272       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2273       benchmark::utils::CheckSSE41);
2274   }
qs8_gemm_4x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2275   static void qs8_gemm_4x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2276     GEMMEnd2EndBenchmark(state, model,
2277       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
2278       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
2279       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2280       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2281       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2282       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2283       benchmark::utils::CheckSSE41);
2284   }
qs8_gemm_4x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2285   static void qs8_gemm_4x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2286     GEMMEnd2EndBenchmark(state, model,
2287       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
2288       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
2289       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2290       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2291       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2292       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2293       benchmark::utils::CheckSSE41);
2294   }
qs8_gemm_2x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2295   static void qs8_gemm_2x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2296     GEMMEnd2EndBenchmark(state, model,
2297       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
2298       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
2299       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2300       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2301       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2302       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2303       benchmark::utils::CheckSSE41);
2304   }
qs8_gemm_2x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2305   static void qs8_gemm_2x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2306     GEMMEnd2EndBenchmark(state, model,
2307       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
2308       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
2309       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2310       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2311       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2312       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2313       benchmark::utils::CheckSSE41);
2314   }
qs8_gemm_3x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2315   static void qs8_gemm_3x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2316     GEMMEnd2EndBenchmark(state, model,
2317       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
2318       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
2319       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2320       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2321       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2322       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2323       benchmark::utils::CheckSSE41);
2324   }
qs8_gemm_3x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2325   static void qs8_gemm_3x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2326     GEMMEnd2EndBenchmark(state, model,
2327       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
2328       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
2329       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2330       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2331       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2332       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2333       benchmark::utils::CheckSSE41);
2334   }
2335 
2336 
qs8_gemm_2x4c8__ssse3_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2337   static void qs8_gemm_2x4c8__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2338     GEMMEnd2EndBenchmark(state, model,
2339       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64,
2340       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64,
2341       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2342       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2343       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2344       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2345       benchmark::utils::CheckSSSE3);
2346   }
qs8_gemm_2x4c8__ssse3_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2347   static void qs8_gemm_2x4c8__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2348     GEMMEnd2EndBenchmark(state, model,
2349       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128,
2350       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128,
2351       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2352       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2353       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2354       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2355       benchmark::utils::CheckSSSE3);
2356   }
qs8_gemm_3x4c8__ssse3_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2357   static void qs8_gemm_3x4c8__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2358     GEMMEnd2EndBenchmark(state, model,
2359       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64,
2360       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64,
2361       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2362       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2363       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2364       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2365       benchmark::utils::CheckSSSE3);
2366   }
qs8_gemm_3x4c8__ssse3_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2367   static void qs8_gemm_3x4c8__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2368     GEMMEnd2EndBenchmark(state, model,
2369       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128,
2370       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128,
2371       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2372       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2373       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2374       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2375       benchmark::utils::CheckSSSE3);
2376   }
qs8_gemm_2x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2377   static void qs8_gemm_2x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2378     GEMMEnd2EndBenchmark(state, model,
2379       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
2380       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
2381       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2382       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2383       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2384       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2385   }
qs8_gemm_2x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2386   static void qs8_gemm_2x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2387     GEMMEnd2EndBenchmark(state, model,
2388       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
2389       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
2390       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2391       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2392       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2393       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2394   }
qs8_gemm_3x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2395   static void qs8_gemm_3x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2396     GEMMEnd2EndBenchmark(state, model,
2397       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
2398       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
2399       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2400       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2401       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2402       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2403   }
qs8_gemm_3x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2404   static void qs8_gemm_3x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2405     GEMMEnd2EndBenchmark(state, model,
2406       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
2407       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
2408       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2409       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2410       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2411       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2412   }
qs8_gemm_4x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2413   static void qs8_gemm_4x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2414     GEMMEnd2EndBenchmark(state, model,
2415       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
2416       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
2417       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2418       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2419       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2420       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2421   }
qs8_gemm_4x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2422   static void qs8_gemm_4x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2423     GEMMEnd2EndBenchmark(state, model,
2424       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
2425       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
2426       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2427       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2428       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2429       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2430   }
qs8_gemm_2x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2431   static void qs8_gemm_2x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2432     GEMMEnd2EndBenchmark(state, model,
2433       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
2434       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
2435       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2436       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2437       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2438       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2439   }
qs8_gemm_2x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2440   static void qs8_gemm_2x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2441     GEMMEnd2EndBenchmark(state, model,
2442       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
2443       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
2444       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2445       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2446       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2447       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2448   }
qs8_gemm_3x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2449   static void qs8_gemm_3x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2450     GEMMEnd2EndBenchmark(state, model,
2451       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
2452       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
2453       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2454       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2455       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2456       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2457   }
qs8_gemm_3x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2458   static void qs8_gemm_3x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2459     GEMMEnd2EndBenchmark(state, model,
2460       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
2461       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
2462       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2463       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2464       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2465       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2466   }
2467 
2468 
  // Benchmark registrations for the x86 QS8 GEMM wrappers, grouped by the
  // instruction-set suffix in each wrapper's name.
  // NOTE(review): BENCHMARK_QS8_END2END is defined outside this view
  // (presumably in bench/end2end.h) — confirm what each invocation expands to.

  // avx512skx wrappers (c8 kernels).
  BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__avx512skx);
  BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__avx512skx);
  BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__avx512skx);

  // avx2 wrappers (c8 kernels).
  BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__avx2);
  BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__avx2);

  // xop wrappers, c2 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__xop_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__xop_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__xop_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__xop_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__xop_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__xop_ld128);

  // xop wrappers, c8 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__xop_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__xop_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__xop_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__xop_ld128);

  // avx wrappers, c2 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__avx_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__avx_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__avx_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__avx_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__avx_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__avx_ld128);

  // avx wrappers, c8 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__avx_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__avx_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__avx_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__avx_ld128);

  // sse41 wrappers, c2 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse41_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse41_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse41_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse41_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse41_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse41_ld128);

  // sse41 wrappers, c8 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse41_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse41_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse41_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse41_ld128);

  // ssse3 wrappers, c8 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__ssse3_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__ssse3_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__ssse3_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__ssse3_ld128);

  // sse2 wrappers, c2 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse2_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse2_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse2_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse2_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse2_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse2_ld128);

  // sse2 wrappers, c8 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse2_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse2_ld128);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse2_ld64);
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse2_ld128);
2528 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2529 
2530 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2531   static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2532     GEMMEnd2EndBenchmark(state, model,
2533       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
2534       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
2535       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2536       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2537       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2538       2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2539   }
qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2540   static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2541     GEMMEnd2EndBenchmark(state, model,
2542       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
2543       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
2544       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2545       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2546       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2547       2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2548   }
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2549   static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2550     GEMMEnd2EndBenchmark(state, model,
2551       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
2552       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
2553       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2554       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2555       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2556       3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2557   }
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2558   static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2559     GEMMEnd2EndBenchmark(state, model,
2560       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
2561       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
2562       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2563       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2564       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2565       3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2566   }
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2567   static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2568     GEMMEnd2EndBenchmark(state, model,
2569       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
2570       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
2571       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2572       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2573       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2574       4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2575   }
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2576   static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2577     GEMMEnd2EndBenchmark(state, model,
2578       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
2579       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
2580       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2581       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2582       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2583       4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2584   }
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2585   static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2586     GEMMEnd2EndBenchmark(state, model,
2587       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
2588       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
2589       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2590       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2591       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2592       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2593   }
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2594   static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2595     GEMMEnd2EndBenchmark(state, model,
2596       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
2597       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
2598       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2599       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2600       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2601       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2602   }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2603   static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2604     GEMMEnd2EndBenchmark(state, model,
2605       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
2606       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
2607       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2608       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2609       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2610       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2611   }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2612   static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2613     GEMMEnd2EndBenchmark(state, model,
2614       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
2615       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
2616       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2617       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2618       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2619       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2620   }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2621   static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2622     GEMMEnd2EndBenchmark(state, model,
2623       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
2624       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
2625       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2626       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2627       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2628       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2629   }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2630   static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2631     GEMMEnd2EndBenchmark(state, model,
2632       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
2633       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
2634       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2635       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2636       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2637       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2638   }
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2639   static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2640     GEMMEnd2EndBenchmark(state, model,
2641       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
2642       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
2643       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2644       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2645       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2646       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2647   }
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2648   static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2649     GEMMEnd2EndBenchmark(state, model,
2650       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
2651       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
2652       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2653       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2654       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2655       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2656   }
qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2657   static void qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2658     GEMMEnd2EndBenchmark(state, model,
2659       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
2660       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
2661       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2662       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2663       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2664       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2665   }
qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2666   static void qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2667     GEMMEnd2EndBenchmark(state, model,
2668       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
2669       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
2670       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2671       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2672       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2673       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2674   }
qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2675   static void qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2676     GEMMEnd2EndBenchmark(state, model,
2677       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
2678       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
2679       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2680       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2681       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2682       4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2683   }
qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2684   static void qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2685     GEMMEnd2EndBenchmark(state, model,
2686       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
2687       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
2688       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2689       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2690       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2691       4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2692   }
qs8_gemm_2x4c8__wasmsimd_mul16_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2693   static void qs8_gemm_2x4c8__wasmsimd_mul16_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2694     GEMMEnd2EndBenchmark(state, model,
2695       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64,
2696       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64,
2697       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64,
2698       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64,
2699       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2700       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2701   }
qs8_gemm_2x4c8__wasmsimd_mul16_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2702   static void qs8_gemm_2x4c8__wasmsimd_mul16_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2703     GEMMEnd2EndBenchmark(state, model,
2704       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128,
2705       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128,
2706       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128,
2707       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128,
2708       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2709       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2710   }
qs8_gemm_3x4c8__wasmsimd_mul16_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2711   static void qs8_gemm_3x4c8__wasmsimd_mul16_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2712     GEMMEnd2EndBenchmark(state, model,
2713       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64,
2714       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64,
2715       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64,
2716       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64,
2717       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2718       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2719   }
qs8_gemm_3x4c8__wasmsimd_mul16_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2720   static void qs8_gemm_3x4c8__wasmsimd_mul16_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2721     GEMMEnd2EndBenchmark(state, model,
2722       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128,
2723       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128,
2724       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128,
2725       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128,
2726       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2727       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2728   }
2729 
  // Benchmark registrations for the wasmsimd QS8 GEMM wrappers in this
  // #if section, grouped by kernel family.
  // NOTE(review): unlike the x86 registrations earlier in the file, these
  // invocations carry no trailing ';' — presumably the macro expansion
  // supplies its own terminator; confirm against the macro definition.

  // dot16x2, c2 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128)

  // dot16x2, c2s4 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)

  // dot16x2, c8 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_QS8_END2END(qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128)

  // mul16, c8 kernels.
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__wasmsimd_mul16_ld64)
  BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__wasmsimd_mul16_ld128)
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__wasmsimd_mul16_ld64)
  BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__wasmsimd_mul16_ld128)
2755 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2756 
2757 
2758 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2759   static void qs8_gemm_2x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2760     GEMMEnd2EndBenchmark(state, model,
2761       xnn_qs8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
2762       xnn_qs8_igemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
2763       xnn_qs8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2764       xnn_qs8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2765       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2766       2 /* mr */, 2 /* nr */);
2767   }
qs8_gemm_3x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2768   static void qs8_gemm_3x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2769     GEMMEnd2EndBenchmark(state, model,
2770       xnn_qs8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
2771       xnn_qs8_igemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
2772       xnn_qs8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2773       xnn_qs8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2774       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2775       3 /* mr */, 2 /* nr */);
2776   }
qs8_gemm_4x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2777   static void qs8_gemm_4x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2778     GEMMEnd2EndBenchmark(state, model,
2779       xnn_qs8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
2780       xnn_qs8_igemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
2781       xnn_qs8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2782       xnn_qs8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2783       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2784       4 /* mr */, 2 /* nr */);
2785   }
qs8_gemm_2x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2786   static void qs8_gemm_2x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2787     GEMMEnd2EndBenchmark(state, model,
2788       xnn_qs8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
2789       xnn_qs8_igemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
2790       xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2791       xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2792       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2793       2 /* mr */, 4 /* nr */);
2794   }
qs8_gemm_3x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2795   static void qs8_gemm_3x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2796     GEMMEnd2EndBenchmark(state, model,
2797       xnn_qs8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
2798       xnn_qs8_igemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
2799       xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2800       xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2801       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2802       3 /* mr */, 4 /* nr */);
2803   }
qs8_gemm_4x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2804   static void qs8_gemm_4x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2805     GEMMEnd2EndBenchmark(state, model,
2806       xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
2807       xnn_qs8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
2808       xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2809       xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2810       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2811       4 /* mr */, 4 /* nr */);
2812   }
2813 
  // Benchmark registrations for the wasm fmagic QS8 GEMM wrappers in this
  // #if section: the nr=2 variants first, then the nr=4 variants.
  // NOTE(review): no trailing ';' on these invocations — presumably the
  // macro expansion supplies its own terminator; confirm against its definition.
  BENCHMARK_QS8_END2END(qs8_gemm_2x2__wasm_fmagic)
  BENCHMARK_QS8_END2END(qs8_gemm_3x2__wasm_fmagic)
  BENCHMARK_QS8_END2END(qs8_gemm_4x2__wasm_fmagic)
  BENCHMARK_QS8_END2END(qs8_gemm_2x4__wasm_fmagic)
  BENCHMARK_QS8_END2END(qs8_gemm_3x4__wasm_fmagic)
  BENCHMARK_QS8_END2END(qs8_gemm_4x4__wasm_fmagic)
2820 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2821 
2822 
2823 static void qs8_gemm_2x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2824   GEMMEnd2EndBenchmark(state, model,
2825     xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
2826     xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
2827     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2828     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2829     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2830     2 /* mr */, 2 /* nr */);
2831 }
qs8_gemm_3x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2832 static void qs8_gemm_3x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2833   GEMMEnd2EndBenchmark(state, model,
2834     xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
2835     xnn_qs8_igemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
2836     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2837     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2838     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2839     3 /* mr */, 2 /* nr */);
2840 }
qs8_gemm_4x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2841 static void qs8_gemm_4x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2842   GEMMEnd2EndBenchmark(state, model,
2843     xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
2844     xnn_qs8_igemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
2845     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2846     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2847     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2848     4 /* mr */, 2 /* nr */);
2849 }
qs8_gemm_2x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2850 static void qs8_gemm_2x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2851   GEMMEnd2EndBenchmark(state, model,
2852     xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
2853     xnn_qs8_igemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
2854     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2855     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2856     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2857     2 /* mr */, 4 /* nr */);
2858 }
qs8_gemm_3x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2859 static void qs8_gemm_3x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2860   GEMMEnd2EndBenchmark(state, model,
2861     xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
2862     xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
2863     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2864     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2865     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2866     3 /* mr */, 4 /* nr */);
2867 }
qs8_gemm_4x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2868 static void qs8_gemm_4x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2869   GEMMEnd2EndBenchmark(state, model,
2870     xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
2871     xnn_qs8_igemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
2872     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2873     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2874     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2875     4 /* mr */, 4 /* nr */);
2876 }
2877 
qs8_gemm_2x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2878 static void qs8_gemm_2x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2879   GEMMEnd2EndBenchmark(state, model,
2880     xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
2881     xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic,
2882     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2883     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2884     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
2885     2 /* mr */, 2 /* nr */);
2886 }
qs8_gemm_3x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2887 static void qs8_gemm_3x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2888   GEMMEnd2EndBenchmark(state, model,
2889     xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
2890     xnn_qs8_igemm_minmax_fp32_ukernel_3x2__scalar_imagic,
2891     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2892     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2893     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
2894     3 /* mr */, 2 /* nr */);
2895 }
qs8_gemm_4x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2896 static void qs8_gemm_4x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2897   GEMMEnd2EndBenchmark(state, model,
2898     xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
2899     xnn_qs8_igemm_minmax_fp32_ukernel_4x2__scalar_imagic,
2900     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2901     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2902     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
2903     4 /* mr */, 2 /* nr */);
2904 }
qs8_gemm_2x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2905 static void qs8_gemm_2x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2906   GEMMEnd2EndBenchmark(state, model,
2907     xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
2908     xnn_qs8_igemm_minmax_fp32_ukernel_2x4__scalar_imagic,
2909     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
2910     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
2911     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
2912     2 /* mr */, 4 /* nr */);
2913 }
qs8_gemm_3x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2914 static void qs8_gemm_3x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2915   GEMMEnd2EndBenchmark(state, model,
2916     xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
2917     xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_imagic,
2918     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
2919     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
2920     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
2921     3 /* mr */, 4 /* nr */);
2922 }
qs8_gemm_4x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2923 static void qs8_gemm_4x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2924   GEMMEnd2EndBenchmark(state, model,
2925     xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic,
2926     xnn_qs8_igemm_minmax_fp32_ukernel_4x4__scalar_imagic,
2927     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
2928     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
2929     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
2930     4 /* mr */, 4 /* nr */);
2931 }
2932 
qs8_gemm_2x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)2933 static void qs8_gemm_2x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
2934   GEMMEnd2EndBenchmark(state, model,
2935     xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
2936     xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
2937     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
2938     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
2939     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
2940     2 /* mr */, 2 /* nr */);
2941 }
qs8_gemm_3x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)2942 static void qs8_gemm_3x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
2943   GEMMEnd2EndBenchmark(state, model,
2944     xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
2945     xnn_qs8_igemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
2946     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
2947     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
2948     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
2949     3 /* mr */, 2 /* nr */);
2950 }
qs8_gemm_4x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)2951 static void qs8_gemm_4x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
2952   GEMMEnd2EndBenchmark(state, model,
2953     xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
2954     xnn_qs8_igemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
2955     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
2956     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
2957     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
2958     4 /* mr */, 2 /* nr */);
2959 }
qs8_gemm_2x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)2960 static void qs8_gemm_2x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
2961   GEMMEnd2EndBenchmark(state, model,
2962     xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
2963     xnn_qs8_igemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
2964     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
2965     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
2966     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
2967     2 /* mr */, 4 /* nr */);
2968 }
qs8_gemm_3x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)2969 static void qs8_gemm_3x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
2970   GEMMEnd2EndBenchmark(state, model,
2971     xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
2972     xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
2973     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
2974     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
2975     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
2976     3 /* mr */, 4 /* nr */);
2977 }
qs8_gemm_4x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)2978 static void qs8_gemm_4x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
2979   GEMMEnd2EndBenchmark(state, model,
2980     xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
2981     xnn_qs8_igemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
2982     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
2983     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
2984     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
2985     4 /* mr */, 4 /* nr */);
2986 }
2987 
// Register the scalar "fmagic" QS8 GEMM wrappers (2x2 through 4x4 tiles) with
// the benchmark framework via BENCHMARK_QS8_END2END.
// NOTE(review): the macro is defined outside this chunk (presumably in
// bench/end2end.h); confirm which models each registration expands to.
2988 BENCHMARK_QS8_END2END(qs8_gemm_2x2__scalar_fmagic)
2989 BENCHMARK_QS8_END2END(qs8_gemm_3x2__scalar_fmagic)
2990 BENCHMARK_QS8_END2END(qs8_gemm_4x2__scalar_fmagic)
2991 BENCHMARK_QS8_END2END(qs8_gemm_2x4__scalar_fmagic)
2992 BENCHMARK_QS8_END2END(qs8_gemm_3x4__scalar_fmagic)
2993 BENCHMARK_QS8_END2END(qs8_gemm_4x4__scalar_fmagic)
2994 
// Register the scalar "imagic" QS8 GEMM wrappers (2x2 through 4x4 tiles) with
// the benchmark framework via BENCHMARK_QS8_END2END.
// NOTE(review): the macro is defined outside this chunk (presumably in
// bench/end2end.h); confirm which models each registration expands to.
2995 BENCHMARK_QS8_END2END(qs8_gemm_2x2__scalar_imagic)
2996 BENCHMARK_QS8_END2END(qs8_gemm_3x2__scalar_imagic)
2997 BENCHMARK_QS8_END2END(qs8_gemm_4x2__scalar_imagic)
2998 BENCHMARK_QS8_END2END(qs8_gemm_2x4__scalar_imagic)
2999 BENCHMARK_QS8_END2END(qs8_gemm_3x4__scalar_imagic)
3000 BENCHMARK_QS8_END2END(qs8_gemm_4x4__scalar_imagic)
3001 
// Register the scalar "lrintf" QS8 GEMM wrappers (2x2 through 4x4 tiles) with
// the benchmark framework via BENCHMARK_QS8_END2END.
// NOTE(review): the macro is defined outside this chunk (presumably in
// bench/end2end.h); confirm which models each registration expands to.
3002 BENCHMARK_QS8_END2END(qs8_gemm_2x2__scalar_lrintf)
3003 BENCHMARK_QS8_END2END(qs8_gemm_3x2__scalar_lrintf)
3004 BENCHMARK_QS8_END2END(qs8_gemm_4x2__scalar_lrintf)
3005 BENCHMARK_QS8_END2END(qs8_gemm_2x4__scalar_lrintf)
3006 BENCHMARK_QS8_END2END(qs8_gemm_3x4__scalar_lrintf)
3007 BENCHMARK_QS8_END2END(qs8_gemm_4x4__scalar_lrintf)
3008 
// Expand BENCHMARK_MAIN() only when XNNPACK_BENCHMARK_NO_MAIN is not defined,
// so a build that defines it can omit whatever the macro emits here
// (presumably Google Benchmark's main(); confirm against benchmark.h).
3009 #ifndef XNNPACK_BENCHMARK_NO_MAIN
3010 BENCHMARK_MAIN();
3011 #endif
3012