1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <xnnpack.h>
13
14 #include <benchmark/benchmark.h>
15
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 #include <xnnpack/gemm.h>
20 #include <xnnpack/igemm.h>
21 #include <xnnpack/params.h>
22 #include <xnnpack/params-init.h>
23
24
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_qs8_gemm_minmax_ukernel_function gemm,xnn_qs8_igemm_minmax_ukernel_function igemm,xnn_qs8_gemm_minmax_ukernel_function gemm1,xnn_qs8_igemm_minmax_ukernel_function igemm1,xnn_init_qs8_conv_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)25 static void GEMMEnd2EndBenchmark(
26 benchmark::State& state,
27 models::ExecutionPlanFactory model_factory,
28 xnn_qs8_gemm_minmax_ukernel_function gemm,
29 xnn_qs8_igemm_minmax_ukernel_function igemm,
30 xnn_qs8_gemm_minmax_ukernel_function gemm1,
31 xnn_qs8_igemm_minmax_ukernel_function igemm1,
32 xnn_init_qs8_conv_minmax_params_fn init_params,
33 uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
34 benchmark::utils::IsaCheckFunction isa_check = nullptr)
35 {
36 if (isa_check && !isa_check(state)) {
37 return;
38 }
39 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
40 state.SkipWithError("failed to initialize XNNPACK");
41 return;
42 }
43
44 // Override microkernels chosen in xnn_initialize
45 // Note: do not directly assign to xnn_params.qs8.gemm because it breaks older gcc.
46 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
47 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
48 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
49 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
50 xnn_params.qs8.gemm.init.qs8 = init_params;
51 xnn_params.qs8.gemm.mr = mr;
52 xnn_params.qs8.gemm.nr = nr;
53 xnn_params.qs8.gemm.log2_kr = log2_kr;
54 xnn_params.qs8.gemm.log2_sr = log2_sr;
55
56 auto execution_plan = model_factory(nullptr);
57 if (execution_plan.empty()) {
58 state.SkipWithError("failed to create a model");
59 return;
60 }
61
62 for (auto _ : state) {
63 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
64 xnn_status status = xnn_run_operator(op.get(), nullptr);
65 if (status != xnn_status_success) {
66 state.SkipWithError("failed to run a model");
67 return;
68 }
69 }
70 }
71
72 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
73 if (cpu_frequency != 0) {
74 state.counters["cpufreq"] = cpu_frequency;
75 }
76 }
77
78 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)79 static void qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
80 GEMMEnd2EndBenchmark(state, model,
81 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55,
82 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55,
83 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
84 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
85 xnn_init_qs8_conv_minmax_rndnu_neon_params,
86 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
87 benchmark::utils::CheckNEONDOT);
88 }
qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State & state,models::ExecutionPlanFactory model)89 static void qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
90 GEMMEnd2EndBenchmark(state, model,
91 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64,
92 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64,
93 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
94 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
95 xnn_init_qs8_conv_minmax_rndnu_neon_params,
96 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
97 benchmark::utils::CheckNEONDOT);
98 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)99 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
100 GEMMEnd2EndBenchmark(state, model,
101 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
102 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
103 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
104 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
105 xnn_init_qs8_conv_minmax_rndnu_neon_params,
106 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
107 benchmark::utils::CheckNEON);
108 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)109 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
110 GEMMEnd2EndBenchmark(state, model,
111 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
112 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
113 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
114 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
115 xnn_init_qs8_conv_minmax_rndnu_neon_params,
116 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
117 benchmark::utils::CheckNEON);
118 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)119 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
120 GEMMEnd2EndBenchmark(state, model,
121 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
122 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
123 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
124 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
125 xnn_init_qs8_conv_minmax_rndnu_neon_params,
126 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
127 benchmark::utils::CheckNEON);
128 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)129 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
130 GEMMEnd2EndBenchmark(state, model,
131 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
132 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
133 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
134 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
135 xnn_init_qs8_conv_minmax_rndnu_neon_params,
136 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
137 benchmark::utils::CheckNEON);
138 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)139 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
140 GEMMEnd2EndBenchmark(state, model,
141 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
142 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
143 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
144 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
145 xnn_init_qs8_conv_minmax_rndnu_neon_params,
146 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
147 benchmark::utils::CheckNEON);
148 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)149 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
150 GEMMEnd2EndBenchmark(state, model,
151 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
152 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
153 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
154 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
155 xnn_init_qs8_conv_minmax_rndnu_neon_params,
156 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
157 benchmark::utils::CheckNEON);
158 }
159 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)
BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_ld64)160 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_ld64)
161 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
162 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
163 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
164 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
165 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
166 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
167 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
168
169 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
170 static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
171 GEMMEnd2EndBenchmark(state, model,
172 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
173 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
174 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
175 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
176 xnn_init_qs8_conv_minmax_rndnu_neon_params,
177 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
178 benchmark::utils::CheckNEONDOT);
179 }
qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State & state,models::ExecutionPlanFactory model)180 static void qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
181 GEMMEnd2EndBenchmark(state, model,
182 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32,
183 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
184 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld32,
185 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
186 xnn_init_qs8_conv_minmax_rndnu_neon_params,
187 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
188 benchmark::utils::CheckNEONDOT);
189 }
qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State & state,models::ExecutionPlanFactory model)190 static void qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
191 GEMMEnd2EndBenchmark(state, model,
192 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64,
193 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64,
194 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
195 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
196 xnn_init_qs8_conv_minmax_rndnu_neon_params,
197 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
198 benchmark::utils::CheckNEONDOT);
199 }
qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)200 static void qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
201 GEMMEnd2EndBenchmark(state, model,
202 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
203 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
204 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
205 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
206 xnn_init_qs8_conv_minmax_rndnu_neon_params,
207 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
208 benchmark::utils::CheckNEONDOT);
209 }
qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)210 static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
211 GEMMEnd2EndBenchmark(state, model,
212 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
213 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
214 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
215 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
216 xnn_init_qs8_conv_minmax_rndnu_neon_params,
217 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
218 benchmark::utils::CheckNEON);
219 }
qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)220 static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
221 GEMMEnd2EndBenchmark(state, model,
222 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
223 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
224 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
225 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
226 xnn_init_qs8_conv_minmax_rndnu_neon_params,
227 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
228 benchmark::utils::CheckNEON);
229 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)230 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
231 GEMMEnd2EndBenchmark(state, model,
232 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
233 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
234 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
235 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
236 xnn_init_qs8_conv_minmax_rndnu_neon_params,
237 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
238 benchmark::utils::CheckNEON);
239 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)240 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
241 GEMMEnd2EndBenchmark(state, model,
242 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
243 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
244 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
245 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
246 xnn_init_qs8_conv_minmax_rndnu_neon_params,
247 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
248 benchmark::utils::CheckNEON);
249 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)250 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
251 GEMMEnd2EndBenchmark(state, model,
252 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
253 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
254 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
255 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
256 xnn_init_qs8_conv_minmax_rndnu_neon_params,
257 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
258 benchmark::utils::CheckNEON);
259 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)260 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
261 GEMMEnd2EndBenchmark(state, model,
262 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
263 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
264 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
265 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
266 xnn_init_qs8_conv_minmax_rndnu_neon_params,
267 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
268 benchmark::utils::CheckNEON);
269 }
qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)270 static void qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
271 GEMMEnd2EndBenchmark(state, model,
272 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
273 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
274 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
275 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
276 xnn_init_qs8_conv_minmax_rndnu_neon_params,
277 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
278 benchmark::utils::CheckNEON);
279 }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,models::ExecutionPlanFactory model)280 static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
281 GEMMEnd2EndBenchmark(state, model,
282 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm,
283 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm,
284 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm,
285 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm,
286 xnn_init_qs8_conv_minmax_rndnu_neon_params,
287 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
288 benchmark::utils::CheckNEON);
289 }
qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)290 static void qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
291 GEMMEnd2EndBenchmark(state, model,
292 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53,
293 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
294 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
295 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
296 xnn_init_qs8_conv_minmax_rndnu_neon_params,
297 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
298 benchmark::utils::CheckNEON);
299 }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)300 static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
301 GEMMEnd2EndBenchmark(state, model,
302 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53,
303 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53,
304 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53,
305 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53,
306 xnn_init_qs8_conv_minmax_rndnu_neon_params,
307 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
308 benchmark::utils::CheckNEON);
309 }
310
311 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld32)312 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld32)
313 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld64)
314 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld128)
315 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
316 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
317 BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
318 BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
319 BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
320 BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
321 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
322 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53)
323 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm)
324 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal)
325 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
326
327 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
328 static void qs8_gemm_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
329 GEMMEnd2EndBenchmark(state, model,
330 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
331 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
332 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
333 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
334 xnn_init_qs8_conv_minmax_rndnu_neon_params,
335 2 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
336 benchmark::utils::CheckNEON);
337 }
qs8_gemm_2x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)338 static void qs8_gemm_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
339 GEMMEnd2EndBenchmark(state, model,
340 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
341 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
342 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
343 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
344 xnn_init_qs8_conv_minmax_rndnu_neon_params,
345 2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
346 benchmark::utils::CheckNEON);
347 }
qs8_gemm_3x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)348 static void qs8_gemm_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
349 GEMMEnd2EndBenchmark(state, model,
350 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
351 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
352 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
353 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
354 xnn_init_qs8_conv_minmax_rndnu_neon_params,
355 3 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
356 benchmark::utils::CheckNEON);
357 }
qs8_gemm_3x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)358 static void qs8_gemm_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
359 GEMMEnd2EndBenchmark(state, model,
360 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
361 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
362 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
363 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
364 xnn_init_qs8_conv_minmax_rndnu_neon_params,
365 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
366 benchmark::utils::CheckNEON);
367 }
qs8_gemm_4x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)368 static void qs8_gemm_4x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
369 GEMMEnd2EndBenchmark(state, model,
370 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
371 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
372 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
373 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
374 xnn_init_qs8_conv_minmax_rndnu_neon_params,
375 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
376 benchmark::utils::CheckNEON);
377 }
qs8_gemm_4x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)378 static void qs8_gemm_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
379 GEMMEnd2EndBenchmark(state, model,
380 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
381 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
382 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
383 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
384 xnn_init_qs8_conv_minmax_rndnu_neon_params,
385 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
386 benchmark::utils::CheckNEON);
387 }
qs8_gemm_6x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)388 static void qs8_gemm_6x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
389 GEMMEnd2EndBenchmark(state, model,
390 xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
391 xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
392 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
393 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
394 xnn_init_qs8_conv_minmax_rndnu_neon_params,
395 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
396 benchmark::utils::CheckNEON);
397 }
qs8_gemm_6x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)398 static void qs8_gemm_6x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
399 GEMMEnd2EndBenchmark(state, model,
400 xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
401 xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
402 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
403 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
404 xnn_init_qs8_conv_minmax_rndnu_neon_params,
405 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
406 benchmark::utils::CheckNEON);
407 }
qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)408 static void qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
409 GEMMEnd2EndBenchmark(state, model,
410 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm,
411 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm,
412 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
413 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
414 xnn_init_qs8_conv_minmax_rndnu_neon_params,
415 2 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
416 benchmark::utils::CheckNEON);
417 }
qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)418 static void qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
419 GEMMEnd2EndBenchmark(state, model,
420 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm,
421 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm,
422 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
423 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
424 xnn_init_qs8_conv_minmax_rndnu_neon_params,
425 2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
426 benchmark::utils::CheckNEON);
427 }
qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)428 static void qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
429 GEMMEnd2EndBenchmark(state, model,
430 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm,
431 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm,
432 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
433 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
434 xnn_init_qs8_conv_minmax_rndnu_neon_params,
435 3 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
436 benchmark::utils::CheckNEON);
437 }
qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)438 static void qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
439 GEMMEnd2EndBenchmark(state, model,
440 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm,
441 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm,
442 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
443 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
444 xnn_init_qs8_conv_minmax_rndnu_neon_params,
445 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
446 benchmark::utils::CheckNEON);
447 }
qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)448 static void qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
449 GEMMEnd2EndBenchmark(state, model,
450 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm,
451 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm,
452 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
453 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
454 xnn_init_qs8_conv_minmax_rndnu_neon_params,
455 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
456 benchmark::utils::CheckNEON);
457 }
qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)458 static void qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
459 GEMMEnd2EndBenchmark(state, model,
460 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm,
461 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm,
462 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
463 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
464 xnn_init_qs8_conv_minmax_rndnu_neon_params,
465 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
466 benchmark::utils::CheckNEON);
467 }
qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)468 static void qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
469 GEMMEnd2EndBenchmark(state, model,
470 xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm,
471 xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm,
472 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
473 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
474 xnn_init_qs8_conv_minmax_rndnu_neon_params,
475 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
476 benchmark::utils::CheckNEON);
477 }
qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)478 static void qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
479 GEMMEnd2EndBenchmark(state, model,
480 xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm,
481 xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm,
482 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
483 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
484 xnn_init_qs8_conv_minmax_rndnu_neon_params,
485 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
486 benchmark::utils::CheckNEON);
487 }
qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)488 static void qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
489 GEMMEnd2EndBenchmark(state, model,
490 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup,
491 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup,
492 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
493 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
494 xnn_init_qs8_conv_minmax_rndnu_neon_params,
495 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
496 benchmark::utils::CheckNEON);
497 }
qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)498 static void qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
499 GEMMEnd2EndBenchmark(state, model,
500 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup,
501 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup,
502 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
503 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
504 xnn_init_qs8_conv_minmax_rndnu_neon_params,
505 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
506 benchmark::utils::CheckNEON);
507 }
qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)508 static void qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
509 GEMMEnd2EndBenchmark(state, model,
510 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup,
511 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup,
512 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
513 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
514 xnn_init_qs8_conv_minmax_rndnu_neon_params,
515 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
516 benchmark::utils::CheckNEON);
517 }
qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)518 static void qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
519 GEMMEnd2EndBenchmark(state, model,
520 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup,
521 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup,
522 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
523 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
524 xnn_init_qs8_conv_minmax_rndnu_neon_params,
525 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
526 benchmark::utils::CheckNEON);
527 }
qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)528 static void qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
529 GEMMEnd2EndBenchmark(state, model,
530 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup,
531 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup,
532 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
533 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
534 xnn_init_qs8_conv_minmax_rndnu_neon_params,
535 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
536 benchmark::utils::CheckNEON);
537 }
qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)538 static void qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
539 GEMMEnd2EndBenchmark(state, model,
540 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup,
541 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup,
542 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
543 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
544 xnn_init_qs8_conv_minmax_rndnu_neon_params,
545 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
546 benchmark::utils::CheckNEON);
547 }
qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)548 static void qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
549 GEMMEnd2EndBenchmark(state, model,
550 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r,
551 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r,
552 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
553 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
554 xnn_init_qs8_conv_minmax_rndnu_neon_params,
555 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
556 benchmark::utils::CheckNEON);
557 }
qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)558 static void qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
559 GEMMEnd2EndBenchmark(state, model,
560 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r,
561 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r,
562 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
563 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
564 xnn_init_qs8_conv_minmax_rndnu_neon_params,
565 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
566 benchmark::utils::CheckNEON);
567 }
qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)568 static void qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
569 GEMMEnd2EndBenchmark(state, model,
570 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r,
571 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r,
572 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
573 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
574 xnn_init_qs8_conv_minmax_rndnu_neon_params,
575 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
576 benchmark::utils::CheckNEON);
577 }
qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)578 static void qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
579 GEMMEnd2EndBenchmark(state, model,
580 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r,
581 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r,
582 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
583 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
584 xnn_init_qs8_conv_minmax_rndnu_neon_params,
585 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
586 benchmark::utils::CheckNEON);
587 }
qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)588 static void qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
589 GEMMEnd2EndBenchmark(state, model,
590 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r,
591 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r,
592 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
593 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
594 xnn_init_qs8_conv_minmax_rndnu_neon_params,
595 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
596 benchmark::utils::CheckNEON);
597 }
qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)598 static void qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
599 GEMMEnd2EndBenchmark(state, model,
600 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r,
601 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r,
602 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
603 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
604 xnn_init_qs8_conv_minmax_rndnu_neon_params,
605 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
606 benchmark::utils::CheckNEON);
607 }
qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)608 static void qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
609 GEMMEnd2EndBenchmark(state, model,
610 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r,
611 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r,
612 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
613 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
614 xnn_init_qs8_conv_minmax_rndnu_neon_params,
615 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
616 benchmark::utils::CheckNEON);
617 }
qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)618 static void qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
619 GEMMEnd2EndBenchmark(state, model,
620 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r,
621 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r,
622 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
623 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
624 xnn_init_qs8_conv_minmax_rndnu_neon_params,
625 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
626 benchmark::utils::CheckNEON);
627 }
qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)628 static void qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
629 GEMMEnd2EndBenchmark(state, model,
630 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r,
631 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r,
632 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
633 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
634 xnn_init_qs8_conv_minmax_rndnu_neon_params,
635 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
636 benchmark::utils::CheckNEON);
637 }
qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)638 static void qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
639 GEMMEnd2EndBenchmark(state, model,
640 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r,
641 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r,
642 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
643 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
644 xnn_init_qs8_conv_minmax_rndnu_neon_params,
645 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
646 benchmark::utils::CheckNEON);
647 }
qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)648 static void qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
649 GEMMEnd2EndBenchmark(state, model,
650 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r,
651 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r,
652 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
653 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
654 xnn_init_qs8_conv_minmax_rndnu_neon_params,
655 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
656 benchmark::utils::CheckNEON);
657 }
qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)658 static void qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
659 GEMMEnd2EndBenchmark(state, model,
660 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r,
661 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r,
662 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
663 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
664 xnn_init_qs8_conv_minmax_rndnu_neon_params,
665 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
666 benchmark::utils::CheckNEON);
667 }
qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)668 static void qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
669 GEMMEnd2EndBenchmark(state, model,
670 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r,
671 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r,
672 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
673 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
674 xnn_init_qs8_conv_minmax_rndnu_neon_params,
675 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
676 benchmark::utils::CheckNEON);
677 }
qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)678 static void qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
679 GEMMEnd2EndBenchmark(state, model,
680 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r,
681 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r,
682 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
683 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
684 xnn_init_qs8_conv_minmax_rndnu_neon_params,
685 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
686 benchmark::utils::CheckNEON);
687 }
qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)688 static void qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
689 GEMMEnd2EndBenchmark(state, model,
690 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r,
691 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r,
692 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
693 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
694 xnn_init_qs8_conv_minmax_rndnu_neon_params,
695 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
696 benchmark::utils::CheckNEON);
697 }
qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)698 static void qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
699 GEMMEnd2EndBenchmark(state, model,
700 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r,
701 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r,
702 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
703 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
704 xnn_init_qs8_conv_minmax_rndnu_neon_params,
705 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
706 benchmark::utils::CheckNEON);
707 }
qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)708 static void qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
709 GEMMEnd2EndBenchmark(state, model,
710 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r,
711 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r,
712 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
713 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
714 xnn_init_qs8_conv_minmax_rndnu_neon_params,
715 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
716 benchmark::utils::CheckNEON);
717 }
qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)718 static void qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
719 GEMMEnd2EndBenchmark(state, model,
720 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r,
721 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r,
722 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
723 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
724 xnn_init_qs8_conv_minmax_rndnu_neon_params,
725 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
726 benchmark::utils::CheckNEON);
727 }
qs8_gemm_2x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)728 static void qs8_gemm_2x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
729 GEMMEnd2EndBenchmark(state, model,
730 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal,
731 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal,
732 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
733 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
734 xnn_init_qs8_conv_minmax_rndnu_neon_params,
735 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
736 benchmark::utils::CheckNEON);
737 }
qs8_gemm_2x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)738 static void qs8_gemm_2x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
739 GEMMEnd2EndBenchmark(state, model,
740 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal,
741 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal,
742 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
743 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
744 xnn_init_qs8_conv_minmax_rndnu_neon_params,
745 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
746 benchmark::utils::CheckNEON);
747 }
qs8_gemm_3x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)748 static void qs8_gemm_3x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
749 GEMMEnd2EndBenchmark(state, model,
750 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal,
751 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal,
752 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
753 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
754 xnn_init_qs8_conv_minmax_rndnu_neon_params,
755 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
756 benchmark::utils::CheckNEON);
757 }
qs8_gemm_3x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)758 static void qs8_gemm_3x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
759 GEMMEnd2EndBenchmark(state, model,
760 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal,
761 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal,
762 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
763 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
764 xnn_init_qs8_conv_minmax_rndnu_neon_params,
765 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
766 benchmark::utils::CheckNEON);
767 }
qs8_gemm_4x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)768 static void qs8_gemm_4x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
769 GEMMEnd2EndBenchmark(state, model,
770 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal,
771 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal,
772 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
773 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
774 xnn_init_qs8_conv_minmax_rndnu_neon_params,
775 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
776 benchmark::utils::CheckNEON);
777 }
qs8_gemm_4x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)778 static void qs8_gemm_4x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
779 GEMMEnd2EndBenchmark(state, model,
780 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal,
781 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal,
782 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
783 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
784 xnn_init_qs8_conv_minmax_rndnu_neon_params,
785 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
786 benchmark::utils::CheckNEON);
787 }
qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)788 static void qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
789 GEMMEnd2EndBenchmark(state, model,
790 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup,
791 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup,
792 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
793 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
794 xnn_init_qs8_conv_minmax_rndnu_neon_params,
795 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
796 benchmark::utils::CheckNEON);
797 }
qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)798 static void qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
799 GEMMEnd2EndBenchmark(state, model,
800 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup,
801 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup,
802 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
803 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
804 xnn_init_qs8_conv_minmax_rndnu_neon_params,
805 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
806 benchmark::utils::CheckNEON);
807 }
qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)808 static void qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
809 GEMMEnd2EndBenchmark(state, model,
810 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup,
811 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup,
812 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
813 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
814 xnn_init_qs8_conv_minmax_rndnu_neon_params,
815 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
816 benchmark::utils::CheckNEON);
817 }
qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)818 static void qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
819 GEMMEnd2EndBenchmark(state, model,
820 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup,
821 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup,
822 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
823 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
824 xnn_init_qs8_conv_minmax_rndnu_neon_params,
825 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
826 benchmark::utils::CheckNEON);
827 }
qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)828 static void qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
829 GEMMEnd2EndBenchmark(state, model,
830 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup,
831 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup,
832 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
833 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
834 xnn_init_qs8_conv_minmax_rndnu_neon_params,
835 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
836 benchmark::utils::CheckNEON);
837 }
qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)838 static void qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
839 GEMMEnd2EndBenchmark(state, model,
840 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup,
841 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup,
842 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
843 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
844 xnn_init_qs8_conv_minmax_rndnu_neon_params,
845 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
846 benchmark::utils::CheckNEON);
847 }
qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)848 static void qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
849 GEMMEnd2EndBenchmark(state, model,
850 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r,
851 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r,
852 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
853 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
854 xnn_init_qs8_conv_minmax_rndnu_neon_params,
855 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
856 benchmark::utils::CheckNEON);
857 }
qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)858 static void qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
859 GEMMEnd2EndBenchmark(state, model,
860 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r,
861 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r,
862 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
863 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
864 xnn_init_qs8_conv_minmax_rndnu_neon_params,
865 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
866 benchmark::utils::CheckNEON);
867 }
qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)868 static void qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
869 GEMMEnd2EndBenchmark(state, model,
870 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r,
871 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r,
872 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
873 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
874 xnn_init_qs8_conv_minmax_rndnu_neon_params,
875 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
876 benchmark::utils::CheckNEON);
877 }
qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)878 static void qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
879 GEMMEnd2EndBenchmark(state, model,
880 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r,
881 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r,
882 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
883 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
884 xnn_init_qs8_conv_minmax_rndnu_neon_params,
885 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
886 benchmark::utils::CheckNEON);
887 }
qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)888 static void qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
889 GEMMEnd2EndBenchmark(state, model,
890 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r,
891 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r,
892 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
893 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
894 xnn_init_qs8_conv_minmax_rndnu_neon_params,
895 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
896 benchmark::utils::CheckNEON);
897 }
qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)898 static void qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
899 GEMMEnd2EndBenchmark(state, model,
900 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r,
901 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r,
902 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
903 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
904 xnn_init_qs8_conv_minmax_rndnu_neon_params,
905 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
906 benchmark::utils::CheckNEON);
907 }
qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)908 static void qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
909 GEMMEnd2EndBenchmark(state, model,
910 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r,
911 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r,
912 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
913 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
914 xnn_init_qs8_conv_minmax_rndnu_neon_params,
915 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
916 benchmark::utils::CheckNEON);
917 }
qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)918 static void qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
919 GEMMEnd2EndBenchmark(state, model,
920 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r,
921 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r,
922 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
923 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
924 xnn_init_qs8_conv_minmax_rndnu_neon_params,
925 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
926 benchmark::utils::CheckNEON);
927 }
qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)928 static void qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
929 GEMMEnd2EndBenchmark(state, model,
930 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r,
931 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r,
932 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
933 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
934 xnn_init_qs8_conv_minmax_rndnu_neon_params,
935 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
936 benchmark::utils::CheckNEON);
937 }
qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)938 static void qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
939 GEMMEnd2EndBenchmark(state, model,
940 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r,
941 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r,
942 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
943 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
944 xnn_init_qs8_conv_minmax_rndnu_neon_params,
945 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
946 benchmark::utils::CheckNEON);
947 }
qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)948 static void qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
949 GEMMEnd2EndBenchmark(state, model,
950 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r,
951 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r,
952 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
953 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
954 xnn_init_qs8_conv_minmax_rndnu_neon_params,
955 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
956 benchmark::utils::CheckNEON);
957 }
qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)958 static void qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
959 GEMMEnd2EndBenchmark(state, model,
960 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r,
961 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r,
962 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
963 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
964 xnn_init_qs8_conv_minmax_rndnu_neon_params,
965 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
966 benchmark::utils::CheckNEON);
967 }
qs8_gemm_2x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)968 static void qs8_gemm_2x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
969 GEMMEnd2EndBenchmark(state, model,
970 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal,
971 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal,
972 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
973 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
974 xnn_init_qs8_conv_minmax_rndnu_neon_params,
975 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
976 benchmark::utils::CheckNEON);
977 }
qs8_gemm_2x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)978 static void qs8_gemm_2x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
979 GEMMEnd2EndBenchmark(state, model,
980 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal,
981 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal,
982 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
983 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
984 xnn_init_qs8_conv_minmax_rndnu_neon_params,
985 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
986 benchmark::utils::CheckNEON);
987 }
qs8_gemm_3x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)988 static void qs8_gemm_3x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
989 GEMMEnd2EndBenchmark(state, model,
990 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal,
991 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal,
992 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
993 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
994 xnn_init_qs8_conv_minmax_rndnu_neon_params,
995 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
996 benchmark::utils::CheckNEON);
997 }
qs8_gemm_3x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)998 static void qs8_gemm_3x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
999 GEMMEnd2EndBenchmark(state, model,
1000 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal,
1001 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal,
1002 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1003 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1004 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1005 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1006 benchmark::utils::CheckNEON);
1007 }
qs8_gemm_4x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1008 static void qs8_gemm_4x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1009 GEMMEnd2EndBenchmark(state, model,
1010 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal,
1011 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal,
1012 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1013 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1014 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1015 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1016 benchmark::utils::CheckNEON);
1017 }
qs8_gemm_4x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1018 static void qs8_gemm_4x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1019 GEMMEnd2EndBenchmark(state, model,
1020 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal,
1021 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal,
1022 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1023 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1024 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1025 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1026 benchmark::utils::CheckNEON);
1027 }
qs8_gemm_2x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1028 static void qs8_gemm_2x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1029 GEMMEnd2EndBenchmark(state, model,
1030 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup,
1031 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup,
1032 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1033 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1034 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1035 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1036 benchmark::utils::CheckNEON);
1037 }
qs8_gemm_2x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1038 static void qs8_gemm_2x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1039 GEMMEnd2EndBenchmark(state, model,
1040 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup,
1041 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup,
1042 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1043 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1044 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1045 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1046 benchmark::utils::CheckNEON);
1047 }
qs8_gemm_3x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1048 static void qs8_gemm_3x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1049 GEMMEnd2EndBenchmark(state, model,
1050 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup,
1051 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup,
1052 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1053 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1054 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1055 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1056 benchmark::utils::CheckNEON);
1057 }
qs8_gemm_3x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1058 static void qs8_gemm_3x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1059 GEMMEnd2EndBenchmark(state, model,
1060 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup,
1061 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup,
1062 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1063 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1064 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1065 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1066 benchmark::utils::CheckNEON);
1067 }
qs8_gemm_4x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1068 static void qs8_gemm_4x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1069 GEMMEnd2EndBenchmark(state, model,
1070 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup,
1071 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup,
1072 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1073 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1074 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1075 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1076 benchmark::utils::CheckNEON);
1077 }
qs8_gemm_4x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1078 static void qs8_gemm_4x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1079 GEMMEnd2EndBenchmark(state, model,
1080 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup,
1081 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup,
1082 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1083 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1084 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1085 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1086 benchmark::utils::CheckNEON);
1087 }
qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1088 static void qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1089 GEMMEnd2EndBenchmark(state, model,
1090 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r,
1091 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r,
1092 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1093 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1094 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1095 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1096 benchmark::utils::CheckNEON);
1097 }
qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1098 static void qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1099 GEMMEnd2EndBenchmark(state, model,
1100 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r,
1101 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r,
1102 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1103 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1104 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1105 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1106 benchmark::utils::CheckNEON);
1107 }
qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1108 static void qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1109 GEMMEnd2EndBenchmark(state, model,
1110 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r,
1111 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r,
1112 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1113 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1114 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1115 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1116 benchmark::utils::CheckNEON);
1117 }
qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1118 static void qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1119 GEMMEnd2EndBenchmark(state, model,
1120 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r,
1121 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r,
1122 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1123 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1124 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1125 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1126 benchmark::utils::CheckNEON);
1127 }
qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1128 static void qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1129 GEMMEnd2EndBenchmark(state, model,
1130 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r,
1131 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r,
1132 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1133 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1134 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1135 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1136 benchmark::utils::CheckNEON);
1137 }
qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1138 static void qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1139 GEMMEnd2EndBenchmark(state, model,
1140 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r,
1141 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r,
1142 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1143 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1144 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1145 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1146 benchmark::utils::CheckNEON);
1147 }
qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1148 static void qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1149 GEMMEnd2EndBenchmark(state, model,
1150 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r,
1151 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r,
1152 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1153 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1154 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1155 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1156 benchmark::utils::CheckNEON);
1157 }
qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1158 static void qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1159 GEMMEnd2EndBenchmark(state, model,
1160 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r,
1161 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r,
1162 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1163 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1164 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1165 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1166 benchmark::utils::CheckNEON);
1167 }
qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1168 static void qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1169 GEMMEnd2EndBenchmark(state, model,
1170 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r,
1171 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r,
1172 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1173 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1174 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1175 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1176 benchmark::utils::CheckNEON);
1177 }
qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1178 static void qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1179 GEMMEnd2EndBenchmark(state, model,
1180 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r,
1181 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r,
1182 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1183 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1184 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1185 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1186 benchmark::utils::CheckNEON);
1187 }
qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1188 static void qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1189 GEMMEnd2EndBenchmark(state, model,
1190 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r,
1191 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r,
1192 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1193 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1194 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1195 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1196 benchmark::utils::CheckNEON);
1197 }
qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1198 static void qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1199 GEMMEnd2EndBenchmark(state, model,
1200 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r,
1201 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r,
1202 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1203 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1204 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1205 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1206 benchmark::utils::CheckNEON);
1207 }
qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1208 static void qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1209 GEMMEnd2EndBenchmark(state, model,
1210 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r,
1211 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r,
1212 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1213 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1214 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1215 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1216 benchmark::utils::CheckNEON);
1217 }
qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1218 static void qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1219 GEMMEnd2EndBenchmark(state, model,
1220 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r,
1221 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r,
1222 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1223 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1224 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1225 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1226 benchmark::utils::CheckNEON);
1227 }
qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1228 static void qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1229 GEMMEnd2EndBenchmark(state, model,
1230 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r,
1231 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r,
1232 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1233 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1234 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1235 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1236 benchmark::utils::CheckNEON);
1237 }
qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1238 static void qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1239 GEMMEnd2EndBenchmark(state, model,
1240 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r,
1241 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r,
1242 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1243 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1244 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1245 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1246 benchmark::utils::CheckNEON);
1247 }
qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1248 static void qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1249 GEMMEnd2EndBenchmark(state, model,
1250 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r,
1251 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r,
1252 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1253 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1254 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1255 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1256 benchmark::utils::CheckNEON);
1257 }
qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1258 static void qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1259 GEMMEnd2EndBenchmark(state, model,
1260 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r,
1261 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r,
1262 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1263 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1264 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1265 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1266 benchmark::utils::CheckNEON);
1267 }
qs8_gemm_2x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1268 static void qs8_gemm_2x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1269 GEMMEnd2EndBenchmark(state, model,
1270 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull,
1271 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull,
1272 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1273 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1274 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1275 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1276 benchmark::utils::CheckNEON);
1277 }
qs8_gemm_2x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1278 static void qs8_gemm_2x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1279 GEMMEnd2EndBenchmark(state, model,
1280 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull,
1281 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull,
1282 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1283 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1284 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1285 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1286 benchmark::utils::CheckNEON);
1287 }
qs8_gemm_3x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1288 static void qs8_gemm_3x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1289 GEMMEnd2EndBenchmark(state, model,
1290 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull,
1291 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull,
1292 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1293 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1294 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1295 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1296 benchmark::utils::CheckNEON);
1297 }
qs8_gemm_3x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1298 static void qs8_gemm_3x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1299 GEMMEnd2EndBenchmark(state, model,
1300 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull,
1301 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull,
1302 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1303 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1304 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1305 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1306 benchmark::utils::CheckNEON);
1307 }
qs8_gemm_4x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1308 static void qs8_gemm_4x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1309 GEMMEnd2EndBenchmark(state, model,
1310 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull,
1311 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull,
1312 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1313 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1314 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1315 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1316 benchmark::utils::CheckNEON);
1317 }
qs8_gemm_4x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1318 static void qs8_gemm_4x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1319 GEMMEnd2EndBenchmark(state, model,
1320 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull,
1321 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull,
1322 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1323 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1324 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1325 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1326 benchmark::utils::CheckNEON);
1327 }
qs8_gemm_2x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1328 static void qs8_gemm_2x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1329 GEMMEnd2EndBenchmark(state, model,
1330 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup,
1331 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup,
1332 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1333 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1334 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1335 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1336 benchmark::utils::CheckNEON);
1337 }
qs8_gemm_2x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1338 static void qs8_gemm_2x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1339 GEMMEnd2EndBenchmark(state, model,
1340 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup,
1341 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup,
1342 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1343 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1344 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1345 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1346 benchmark::utils::CheckNEON);
1347 }
qs8_gemm_3x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1348 static void qs8_gemm_3x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1349 GEMMEnd2EndBenchmark(state, model,
1350 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup,
1351 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup,
1352 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1353 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1354 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1355 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1356 benchmark::utils::CheckNEON);
1357 }
qs8_gemm_3x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1358 static void qs8_gemm_3x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1359 GEMMEnd2EndBenchmark(state, model,
1360 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup,
1361 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup,
1362 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1363 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1364 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1365 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1366 benchmark::utils::CheckNEON);
1367 }
qs8_gemm_4x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1368 static void qs8_gemm_4x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1369 GEMMEnd2EndBenchmark(state, model,
1370 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup,
1371 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup,
1372 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1373 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1374 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1375 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1376 benchmark::utils::CheckNEON);
1377 }
qs8_gemm_4x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1378 static void qs8_gemm_4x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1379 GEMMEnd2EndBenchmark(state, model,
1380 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup,
1381 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup,
1382 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1383 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1384 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1385 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1386 benchmark::utils::CheckNEON);
1387 }
qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1388 static void qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1389 GEMMEnd2EndBenchmark(state, model,
1390 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r,
1391 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r,
1392 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1393 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1394 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1395 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1396 benchmark::utils::CheckNEON);
1397 }
qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1398 static void qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1399 GEMMEnd2EndBenchmark(state, model,
1400 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r,
1401 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r,
1402 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1403 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1404 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1405 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1406 benchmark::utils::CheckNEON);
1407 }
qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1408 static void qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1409 GEMMEnd2EndBenchmark(state, model,
1410 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r,
1411 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r,
1412 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1413 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1414 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1415 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1416 benchmark::utils::CheckNEON);
1417 }
qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1418 static void qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1419 GEMMEnd2EndBenchmark(state, model,
1420 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r,
1421 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r,
1422 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1423 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1424 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1425 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1426 benchmark::utils::CheckNEON);
1427 }
qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1428 static void qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1429 GEMMEnd2EndBenchmark(state, model,
1430 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r,
1431 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r,
1432 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1433 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1434 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1435 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1436 benchmark::utils::CheckNEON);
1437 }
qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1438 static void qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1439 GEMMEnd2EndBenchmark(state, model,
1440 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r,
1441 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r,
1442 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1443 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1444 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1445 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1446 benchmark::utils::CheckNEON);
1447 }
qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1448 static void qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1449 GEMMEnd2EndBenchmark(state, model,
1450 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r,
1451 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r,
1452 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1453 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1454 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1455 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1456 benchmark::utils::CheckNEON);
1457 }
qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1458 static void qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1459 GEMMEnd2EndBenchmark(state, model,
1460 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r,
1461 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r,
1462 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1463 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1464 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1465 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1466 benchmark::utils::CheckNEON);
1467 }
qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1468 static void qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1469 GEMMEnd2EndBenchmark(state, model,
1470 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r,
1471 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r,
1472 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1473 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1474 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1475 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1476 benchmark::utils::CheckNEON);
1477 }
qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1478 static void qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1479 GEMMEnd2EndBenchmark(state, model,
1480 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r,
1481 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r,
1482 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1483 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1484 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1485 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1486 benchmark::utils::CheckNEON);
1487 }
qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1488 static void qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1489 GEMMEnd2EndBenchmark(state, model,
1490 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r,
1491 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r,
1492 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1493 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1494 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1495 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1496 benchmark::utils::CheckNEON);
1497 }
qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1498 static void qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1499 GEMMEnd2EndBenchmark(state, model,
1500 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r,
1501 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r,
1502 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1503 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1504 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1505 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1506 benchmark::utils::CheckNEON);
1507 }
qs8_gemm_2x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1508 static void qs8_gemm_2x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1509 GEMMEnd2EndBenchmark(state, model,
1510 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull,
1511 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull,
1512 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1513 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1514 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1515 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1516 benchmark::utils::CheckNEON);
1517 }
qs8_gemm_2x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1518 static void qs8_gemm_2x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1519 GEMMEnd2EndBenchmark(state, model,
1520 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull,
1521 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull,
1522 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1523 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1524 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1525 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1526 benchmark::utils::CheckNEON);
1527 }
qs8_gemm_3x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1528 static void qs8_gemm_3x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1529 GEMMEnd2EndBenchmark(state, model,
1530 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull,
1531 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull,
1532 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1533 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1534 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1535 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1536 benchmark::utils::CheckNEON);
1537 }
qs8_gemm_3x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1538 static void qs8_gemm_3x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1539 GEMMEnd2EndBenchmark(state, model,
1540 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull,
1541 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull,
1542 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1543 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1544 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1545 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1546 benchmark::utils::CheckNEON);
1547 }
qs8_gemm_4x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1548 static void qs8_gemm_4x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1549 GEMMEnd2EndBenchmark(state, model,
1550 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull,
1551 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull,
1552 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1553 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1554 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1555 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1556 benchmark::utils::CheckNEON);
1557 }
qs8_gemm_4x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1558 static void qs8_gemm_4x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1559 GEMMEnd2EndBenchmark(state, model,
1560 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull,
1561 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull,
1562 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1563 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1564 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1565 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1566 benchmark::utils::CheckNEON);
1567 }
qs8_gemm_4x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1568 static void qs8_gemm_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1569 GEMMEnd2EndBenchmark(state, model,
1570 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
1571 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot,
1572 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
1573 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
1574 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1575 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1576 benchmark::utils::CheckNEONDOT);
1577 }
qs8_gemm_6x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1578 static void qs8_gemm_6x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1579 GEMMEnd2EndBenchmark(state, model,
1580 xnn_qs8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
1581 xnn_qs8_igemm_minmax_rndnu_ukernel_6x8c4__neondot,
1582 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
1583 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
1584 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1585 6 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1586 benchmark::utils::CheckNEONDOT);
1587 }
qs8_gemm_8x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1588 static void qs8_gemm_8x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1589 GEMMEnd2EndBenchmark(state, model,
1590 xnn_qs8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
1591 xnn_qs8_igemm_minmax_rndnu_ukernel_8x8c4__neondot,
1592 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
1593 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
1594 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1595 8 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1596 benchmark::utils::CheckNEONDOT);
1597 }
qs8_gemm_4x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1598 static void qs8_gemm_4x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1599 GEMMEnd2EndBenchmark(state, model,
1600 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
1601 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
1602 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
1603 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
1604 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1605 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1606 benchmark::utils::CheckNEONDOT);
1607 }
qs8_gemm_6x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1608 static void qs8_gemm_6x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1609 GEMMEnd2EndBenchmark(state, model,
1610 xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
1611 xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot,
1612 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
1613 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
1614 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1615 6 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1616 benchmark::utils::CheckNEONDOT);
1617 }
qs8_gemm_8x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)1618 static void qs8_gemm_8x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
1619 GEMMEnd2EndBenchmark(state, model,
1620 xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
1621 xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot,
1622 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
1623 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
1624 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1625 8 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1626 benchmark::utils::CheckNEONDOT);
1627 }
qs8_gemm_2x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1628 static void qs8_gemm_2x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1629 GEMMEnd2EndBenchmark(state, model,
1630 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mull,
1631 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull,
1632 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1633 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1634 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1635 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1636 benchmark::utils::CheckNEON);
1637 }
qs8_gemm_2x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1638 static void qs8_gemm_2x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1639 GEMMEnd2EndBenchmark(state, model,
1640 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull,
1641 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mull,
1642 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1643 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1644 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1645 2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1646 benchmark::utils::CheckNEON);
1647 }
qs8_gemm_3x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1648 static void qs8_gemm_3x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1649 GEMMEnd2EndBenchmark(state, model,
1650 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mull,
1651 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull,
1652 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1653 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1654 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1655 3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1656 benchmark::utils::CheckNEON);
1657 }
qs8_gemm_3x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1658 static void qs8_gemm_3x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1659 GEMMEnd2EndBenchmark(state, model,
1660 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull,
1661 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull,
1662 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1663 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1664 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1665 3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1666 benchmark::utils::CheckNEON);
1667 }
qs8_gemm_4x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1668 static void qs8_gemm_4x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1669 GEMMEnd2EndBenchmark(state, model,
1670 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mull,
1671 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull,
1672 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1673 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1674 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1675 4 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1676 benchmark::utils::CheckNEON);
1677 }
qs8_gemm_4x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1678 static void qs8_gemm_4x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1679 GEMMEnd2EndBenchmark(state, model,
1680 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull,
1681 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mull,
1682 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1683 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1684 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1685 4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1686 benchmark::utils::CheckNEON);
1687 }
qs8_gemm_2x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1688 static void qs8_gemm_2x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1689 GEMMEnd2EndBenchmark(state, model,
1690 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__neon_mlal,
1691 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal,
1692 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1693 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1694 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1695 2 /* mr */, 8 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1696 benchmark::utils::CheckNEON);
1697 }
qs8_gemm_2x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1698 static void qs8_gemm_2x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1699 GEMMEnd2EndBenchmark(state, model,
1700 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal,
1701 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal,
1702 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1703 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1704 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1705 2 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1706 benchmark::utils::CheckNEON);
1707 }
qs8_gemm_3x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1708 static void qs8_gemm_3x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1709 GEMMEnd2EndBenchmark(state, model,
1710 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c16__neon_mlal,
1711 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal,
1712 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1713 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1714 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1715 4 /* mr */, 8 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1716 benchmark::utils::CheckNEON);
1717 }
qs8_gemm_3x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1718 static void qs8_gemm_3x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1719 GEMMEnd2EndBenchmark(state, model,
1720 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal,
1721 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal,
1722 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1723 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1724 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1725 4 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1726 benchmark::utils::CheckNEON);
1727 }
qs8_gemm_4x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1728 static void qs8_gemm_4x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1729 GEMMEnd2EndBenchmark(state, model,
1730 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c16__neon_mlal,
1731 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal,
1732 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1733 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1734 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1735 4 /* mr */, 8 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1736 benchmark::utils::CheckNEON);
1737 }
qs8_gemm_4x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1738 static void qs8_gemm_4x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1739 GEMMEnd2EndBenchmark(state, model,
1740 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal,
1741 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal,
1742 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1743 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1744 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1745 4 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1746 benchmark::utils::CheckNEON);
1747 }
qs8_gemm_2x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1748 static void qs8_gemm_2x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1749 GEMMEnd2EndBenchmark(state, model,
1750 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal,
1751 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal,
1752 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1753 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1754 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1755 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1756 benchmark::utils::CheckNEON);
1757 }
qs8_gemm_2x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1758 static void qs8_gemm_2x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1759 GEMMEnd2EndBenchmark(state, model,
1760 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal,
1761 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mlal,
1762 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1763 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1764 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1765 2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1766 benchmark::utils::CheckNEON);
1767 }
qs8_gemm_3x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1768 static void qs8_gemm_3x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1769 GEMMEnd2EndBenchmark(state, model,
1770 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal,
1771 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal,
1772 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1773 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1774 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1775 3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1776 benchmark::utils::CheckNEON);
1777 }
qs8_gemm_3x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1778 static void qs8_gemm_3x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1779 GEMMEnd2EndBenchmark(state, model,
1780 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal,
1781 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal,
1782 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1783 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1784 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1785 3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1786 benchmark::utils::CheckNEON);
1787 }
qs8_gemm_4x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1788 static void qs8_gemm_4x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1789 GEMMEnd2EndBenchmark(state, model,
1790 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal,
1791 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal,
1792 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1793 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1794 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1795 4 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1796 benchmark::utils::CheckNEON);
1797 }
qs8_gemm_4x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1798 static void qs8_gemm_4x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1799 GEMMEnd2EndBenchmark(state, model,
1800 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal,
1801 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal,
1802 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1803 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1804 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1805 4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1806 benchmark::utils::CheckNEON);
1807 }
1808
1809 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neondot);
1810 BENCHMARK_QS8_END2END(qs8_gemm_6x8c4__neondot);
1811 BENCHMARK_QS8_END2END(qs8_gemm_8x8c4__neondot);
1812 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neondot);
1813 BENCHMARK_QS8_END2END(qs8_gemm_6x16c4__neondot);
1814 BENCHMARK_QS8_END2END(qs8_gemm_8x16c4__neondot);
1815
1816 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__neon_mlal);
1817 BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__neon_mlal);
1818 BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__neon_mlal);
1819 BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__neon_mlal);
1820 BENCHMARK_QS8_END2END(qs8_gemm_4x8c8__neon_mlal);
1821 BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__neon_mlal);
1822
1823 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__neon_mull);
1824 BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__neon_mull);
1825 BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__neon_mull);
1826 BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__neon_mull);
1827 BENCHMARK_QS8_END2END(qs8_gemm_4x8c8__neon_mull);
1828 BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__neon_mull);
1829
1830 BENCHMARK_QS8_END2END(qs8_gemm_2x8c16__neon_mlal);
1831 BENCHMARK_QS8_END2END(qs8_gemm_2x16c16__neon_mlal);
1832 BENCHMARK_QS8_END2END(qs8_gemm_3x8c16__neon_mlal);
1833 BENCHMARK_QS8_END2END(qs8_gemm_3x16c16__neon_mlal);
1834 BENCHMARK_QS8_END2END(qs8_gemm_4x8c16__neon_mlal);
1835 BENCHMARK_QS8_END2END(qs8_gemm_4x16c16__neon_mlal);
1836
1837 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_dup);
1838 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_dup);
1839 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_dup);
1840 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_dup);
1841 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_dup);
1842 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_dup);
1843
1844 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_dup);
1845 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_dup);
1846 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_dup);
1847 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_dup);
1848 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_dup);
1849 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_dup);
1850
1851 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_ld1r);
1852 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_ld1r);
1853 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_ld1r);
1854 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_ld1r);
1855 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_ld1r);
1856 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_ld1r);
1857
1858 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_ld1r);
1859 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_ld1r);
1860 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_ld1r);
1861 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_ld1r);
1862 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_ld1r);
1863 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_ld1r);
1864
1865 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_ld2r);
1866 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_ld2r);
1867 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_ld2r);
1868 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_ld2r);
1869 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_ld2r);
1870 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_ld2r);
1871
1872 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_ld2r);
1873 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_ld2r);
1874 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_ld2r);
1875 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_ld2r);
1876 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_ld2r);
1877 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_ld2r);
1878
1879 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4s2__neon_mlal);
1880 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4s2__neon_mlal);
1881 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4s2__neon_mlal);
1882 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4s2__neon_mlal);
1883 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4s2__neon_mlal);
1884 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4s2__neon_mlal);
1885
1886 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4s2__neon_mull);
1887 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4s2__neon_mull);
1888 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4s2__neon_mull);
1889 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4s2__neon_mull);
1890 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4s2__neon_mull);
1891 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4s2__neon_mull);
1892
1893 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_dup);
1894 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_dup);
1895 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_dup);
1896 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_dup);
1897 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_dup);
1898 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_dup);
1899
1900 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_dup);
1901 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_dup);
1902 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_dup);
1903 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_dup);
1904 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_dup);
1905 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_dup);
1906
1907 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld1r);
1908 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld1r);
1909 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld1r);
1910 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld1r);
1911 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld1r);
1912 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld1r);
1913
1914 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld1r);
1915 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld1r);
1916 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld1r);
1917 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld1r);
1918 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld1r);
1919 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld1r);
1920
1921 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld2r);
1922 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld2r);
1923 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld2r);
1924 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld2r);
1925 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld2r);
1926 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld2r);
1927
1928 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld2r);
1929 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld2r);
1930 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld2r);
1931 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld2r);
1932 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld2r);
1933 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld2r);
1934
1935 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld4r);
1936 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld4r);
1937 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld4r);
1938 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld4r);
1939 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld4r);
1940 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld4r);
1941
1942 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld4r);
1943 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld4r);
1944 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld4r);
1945 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld4r);
1946 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld4r);
1947 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld4r);
1948
1949 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2s4__neon_mlal);
1950 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2s4__neon_mlal);
1951 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2s4__neon_mlal);
1952 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2s4__neon_mlal);
1953 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2s4__neon_mlal);
1954 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2s4__neon_mlal);
1955
1956 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2s4__neon_mull);
1957 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2s4__neon_mull);
1958 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2s4__neon_mull);
1959 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2s4__neon_mull);
1960 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2s4__neon_mull);
1961 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2s4__neon_mull);
1962
1963 BENCHMARK_QS8_END2END(qs8_gemm_2x8__neon_mlal_lane);
1964 BENCHMARK_QS8_END2END(qs8_gemm_2x16__neon_mlal_lane);
1965 BENCHMARK_QS8_END2END(qs8_gemm_3x8__neon_mlal_lane);
1966 BENCHMARK_QS8_END2END(qs8_gemm_3x16__neon_mlal_lane);
1967 BENCHMARK_QS8_END2END(qs8_gemm_4x8__neon_mlal_lane);
1968 BENCHMARK_QS8_END2END(qs8_gemm_4x16__neon_mlal_lane);
1969 BENCHMARK_QS8_END2END(qs8_gemm_6x8__neon_mlal_lane);
1970 BENCHMARK_QS8_END2END(qs8_gemm_6x16__neon_mlal_lane);
1971
1972 BENCHMARK_QS8_END2END(qs8_gemm_2x8__neon_mlal_lane_prfm);
1973 BENCHMARK_QS8_END2END(qs8_gemm_2x16__neon_mlal_lane_prfm);
1974 BENCHMARK_QS8_END2END(qs8_gemm_3x8__neon_mlal_lane_prfm);
1975 BENCHMARK_QS8_END2END(qs8_gemm_3x16__neon_mlal_lane_prfm);
1976 BENCHMARK_QS8_END2END(qs8_gemm_4x8__neon_mlal_lane_prfm);
1977 BENCHMARK_QS8_END2END(qs8_gemm_4x16__neon_mlal_lane_prfm);
1978 BENCHMARK_QS8_END2END(qs8_gemm_6x8__neon_mlal_lane_prfm);
1979 BENCHMARK_QS8_END2END(qs8_gemm_6x16__neon_mlal_lane_prfm);
1980 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1981
1982 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qs8_gemm_2x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)1983 static void qs8_gemm_2x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
1984 GEMMEnd2EndBenchmark(state, model,
1985 xnn_qs8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
1986 xnn_qs8_igemm_minmax_fp32_ukernel_2x16c8__avx512skx,
1987 xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
1988 xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
1989 xnn_init_qs8_conv_minmax_fp32_avx512_params,
1990 2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1991 benchmark::utils::CheckAVX512F);
1992 }
qs8_gemm_3x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)1993 static void qs8_gemm_3x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
1994 GEMMEnd2EndBenchmark(state, model,
1995 xnn_qs8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
1996 xnn_qs8_igemm_minmax_fp32_ukernel_3x16c8__avx512skx,
1997 xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
1998 xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
1999 xnn_init_qs8_conv_minmax_fp32_avx512_params,
2000 3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2001 benchmark::utils::CheckAVX512F);
2002 }
qs8_gemm_4x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)2003 static void qs8_gemm_4x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
2004 GEMMEnd2EndBenchmark(state, model,
2005 xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
2006 xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx,
2007 xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2008 xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2009 xnn_init_qs8_conv_minmax_fp32_avx512_params,
2010 4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2011 benchmark::utils::CheckAVX512F);
2012 }
qs8_gemm_2x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)2013 static void qs8_gemm_2x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
2014 GEMMEnd2EndBenchmark(state, model,
2015 xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
2016 xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__avx2,
2017 xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
2018 xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
2019 xnn_init_qs8_conv_minmax_fp32_avx2_params,
2020 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2021 benchmark::utils::CheckAVX2);
2022 }
qs8_gemm_3x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)2023 static void qs8_gemm_3x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
2024 GEMMEnd2EndBenchmark(state, model,
2025 xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
2026 xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2,
2027 xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
2028 xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
2029 xnn_init_qs8_conv_minmax_fp32_avx2_params,
2030 3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2031 benchmark::utils::CheckAVX2);
2032 }
qs8_gemm_2x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2033 static void qs8_gemm_2x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2034 GEMMEnd2EndBenchmark(state, model,
2035 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
2036 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
2037 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2038 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2039 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2040 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2041 benchmark::utils::CheckXOP);
2042 }
qs8_gemm_2x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2043 static void qs8_gemm_2x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2044 GEMMEnd2EndBenchmark(state, model,
2045 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
2046 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
2047 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2048 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2049 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2050 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2051 benchmark::utils::CheckXOP);
2052 }
qs8_gemm_3x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2053 static void qs8_gemm_3x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2054 GEMMEnd2EndBenchmark(state, model,
2055 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
2056 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
2057 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2058 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2059 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2060 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2061 benchmark::utils::CheckXOP);
2062 }
qs8_gemm_3x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2063 static void qs8_gemm_3x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2064 GEMMEnd2EndBenchmark(state, model,
2065 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
2066 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
2067 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2068 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2069 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2070 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2071 benchmark::utils::CheckXOP);
2072 }
qs8_gemm_4x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2073 static void qs8_gemm_4x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2074 GEMMEnd2EndBenchmark(state, model,
2075 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
2076 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
2077 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2078 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2079 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2080 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2081 benchmark::utils::CheckXOP);
2082 }
qs8_gemm_4x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2083 static void qs8_gemm_4x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2084 GEMMEnd2EndBenchmark(state, model,
2085 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
2086 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
2087 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2088 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2089 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2090 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2091 benchmark::utils::CheckXOP);
2092 }
qs8_gemm_2x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2093 static void qs8_gemm_2x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2094 GEMMEnd2EndBenchmark(state, model,
2095 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
2096 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
2097 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2098 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2099 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2100 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2101 benchmark::utils::CheckXOP);
2102 }
qs8_gemm_3x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2103 static void qs8_gemm_3x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2104 GEMMEnd2EndBenchmark(state, model,
2105 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
2106 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
2107 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2108 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2109 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2110 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2111 benchmark::utils::CheckXOP);
2112 }
qs8_gemm_2x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2113 static void qs8_gemm_2x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2114 GEMMEnd2EndBenchmark(state, model,
2115 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
2116 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
2117 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2118 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2119 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2120 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2121 benchmark::utils::CheckXOP);
2122 }
qs8_gemm_3x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2123 static void qs8_gemm_3x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2124 GEMMEnd2EndBenchmark(state, model,
2125 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
2126 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
2127 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2128 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2129 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2130 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2131 benchmark::utils::CheckXOP);
2132 }
qs8_gemm_2x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2133 static void qs8_gemm_2x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2134 GEMMEnd2EndBenchmark(state, model,
2135 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
2136 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
2137 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2138 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2139 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2140 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2141 benchmark::utils::CheckAVX);
2142 }
qs8_gemm_2x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2143 static void qs8_gemm_2x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2144 GEMMEnd2EndBenchmark(state, model,
2145 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
2146 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
2147 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2148 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2149 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2150 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2151 benchmark::utils::CheckAVX);
2152 }
qs8_gemm_3x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2153 static void qs8_gemm_3x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2154 GEMMEnd2EndBenchmark(state, model,
2155 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
2156 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
2157 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2158 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2159 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2160 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2161 benchmark::utils::CheckAVX);
2162 }
qs8_gemm_3x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2163 static void qs8_gemm_3x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2164 GEMMEnd2EndBenchmark(state, model,
2165 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
2166 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
2167 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2168 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2169 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2170 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2171 benchmark::utils::CheckAVX);
2172 }
qs8_gemm_4x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2173 static void qs8_gemm_4x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2174 GEMMEnd2EndBenchmark(state, model,
2175 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
2176 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
2177 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2178 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2179 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2180 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2181 benchmark::utils::CheckAVX);
2182 }
qs8_gemm_4x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2183 static void qs8_gemm_4x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2184 GEMMEnd2EndBenchmark(state, model,
2185 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
2186 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
2187 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2188 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2189 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2190 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2191 benchmark::utils::CheckAVX);
2192 }
2193
2194
qs8_gemm_2x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2195 static void qs8_gemm_2x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2196 GEMMEnd2EndBenchmark(state, model,
2197 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
2198 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
2199 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2200 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2201 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2202 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2203 benchmark::utils::CheckAVX);
2204 }
qs8_gemm_2x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2205 static void qs8_gemm_2x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2206 GEMMEnd2EndBenchmark(state, model,
2207 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
2208 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
2209 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2210 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2211 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2212 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2213 benchmark::utils::CheckAVX);
2214 }
qs8_gemm_3x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2215 static void qs8_gemm_3x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2216 GEMMEnd2EndBenchmark(state, model,
2217 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
2218 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
2219 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2220 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2221 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2222 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2223 benchmark::utils::CheckAVX);
2224 }
qs8_gemm_3x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2225 static void qs8_gemm_3x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2226 GEMMEnd2EndBenchmark(state, model,
2227 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
2228 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
2229 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2230 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2231 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2232 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2233 benchmark::utils::CheckAVX);
2234 }
qs8_gemm_2x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2235 static void qs8_gemm_2x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2236 GEMMEnd2EndBenchmark(state, model,
2237 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
2238 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
2239 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2240 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2241 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2242 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2243 benchmark::utils::CheckSSE41);
2244 }
qs8_gemm_2x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2245 static void qs8_gemm_2x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2246 GEMMEnd2EndBenchmark(state, model,
2247 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
2248 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
2249 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2250 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2251 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2252 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2253 benchmark::utils::CheckSSE41);
2254 }
qs8_gemm_3x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2255 static void qs8_gemm_3x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2256 GEMMEnd2EndBenchmark(state, model,
2257 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
2258 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
2259 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2260 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2261 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2262 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2263 benchmark::utils::CheckSSE41);
2264 }
qs8_gemm_3x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2265 static void qs8_gemm_3x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2266 GEMMEnd2EndBenchmark(state, model,
2267 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
2268 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
2269 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2270 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2271 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2272 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2273 benchmark::utils::CheckSSE41);
2274 }
qs8_gemm_4x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2275 static void qs8_gemm_4x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2276 GEMMEnd2EndBenchmark(state, model,
2277 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
2278 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
2279 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2280 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2281 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2282 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2283 benchmark::utils::CheckSSE41);
2284 }
qs8_gemm_4x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2285 static void qs8_gemm_4x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2286 GEMMEnd2EndBenchmark(state, model,
2287 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
2288 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
2289 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2290 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2291 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2292 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2293 benchmark::utils::CheckSSE41);
2294 }
qs8_gemm_2x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2295 static void qs8_gemm_2x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2296 GEMMEnd2EndBenchmark(state, model,
2297 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
2298 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
2299 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2300 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2301 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2302 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2303 benchmark::utils::CheckSSE41);
2304 }
qs8_gemm_2x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2305 static void qs8_gemm_2x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2306 GEMMEnd2EndBenchmark(state, model,
2307 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
2308 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
2309 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2310 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2311 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2312 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2313 benchmark::utils::CheckSSE41);
2314 }
qs8_gemm_3x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2315 static void qs8_gemm_3x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2316 GEMMEnd2EndBenchmark(state, model,
2317 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
2318 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
2319 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2320 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2321 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2322 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2323 benchmark::utils::CheckSSE41);
2324 }
qs8_gemm_3x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2325 static void qs8_gemm_3x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2326 GEMMEnd2EndBenchmark(state, model,
2327 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
2328 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
2329 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2330 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2331 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2332 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2333 benchmark::utils::CheckSSE41);
2334 }
2335
2336
qs8_gemm_2x4c8__ssse3_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2337 static void qs8_gemm_2x4c8__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2338 GEMMEnd2EndBenchmark(state, model,
2339 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64,
2340 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64,
2341 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2342 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2343 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2344 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2345 benchmark::utils::CheckSSSE3);
2346 }
qs8_gemm_2x4c8__ssse3_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2347 static void qs8_gemm_2x4c8__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2348 GEMMEnd2EndBenchmark(state, model,
2349 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128,
2350 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128,
2351 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2352 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2353 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2354 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2355 benchmark::utils::CheckSSSE3);
2356 }
qs8_gemm_3x4c8__ssse3_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2357 static void qs8_gemm_3x4c8__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2358 GEMMEnd2EndBenchmark(state, model,
2359 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64,
2360 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64,
2361 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2362 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2363 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2364 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2365 benchmark::utils::CheckSSSE3);
2366 }
qs8_gemm_3x4c8__ssse3_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2367 static void qs8_gemm_3x4c8__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2368 GEMMEnd2EndBenchmark(state, model,
2369 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128,
2370 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128,
2371 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2372 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2373 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2374 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2375 benchmark::utils::CheckSSSE3);
2376 }
qs8_gemm_2x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2377 static void qs8_gemm_2x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2378 GEMMEnd2EndBenchmark(state, model,
2379 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
2380 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
2381 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2382 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2383 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2384 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2385 }
qs8_gemm_2x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2386 static void qs8_gemm_2x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2387 GEMMEnd2EndBenchmark(state, model,
2388 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
2389 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
2390 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2391 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2392 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2393 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2394 }
qs8_gemm_3x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2395 static void qs8_gemm_3x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2396 GEMMEnd2EndBenchmark(state, model,
2397 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
2398 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
2399 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2400 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2401 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2402 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2403 }
qs8_gemm_3x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2404 static void qs8_gemm_3x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2405 GEMMEnd2EndBenchmark(state, model,
2406 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
2407 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
2408 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2409 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2410 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2411 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2412 }
qs8_gemm_4x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2413 static void qs8_gemm_4x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2414 GEMMEnd2EndBenchmark(state, model,
2415 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
2416 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
2417 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2418 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2419 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2420 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2421 }
qs8_gemm_4x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2422 static void qs8_gemm_4x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2423 GEMMEnd2EndBenchmark(state, model,
2424 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
2425 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
2426 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2427 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2428 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2429 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2430 }
qs8_gemm_2x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2431 static void qs8_gemm_2x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2432 GEMMEnd2EndBenchmark(state, model,
2433 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
2434 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
2435 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2436 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2437 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2438 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2439 }
qs8_gemm_2x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2440 static void qs8_gemm_2x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2441 GEMMEnd2EndBenchmark(state, model,
2442 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
2443 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
2444 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2445 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2446 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2447 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2448 }
qs8_gemm_3x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2449 static void qs8_gemm_3x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2450 GEMMEnd2EndBenchmark(state, model,
2451 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
2452 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
2453 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2454 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2455 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2456 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2457 }
qs8_gemm_3x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2458 static void qs8_gemm_3x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2459 GEMMEnd2EndBenchmark(state, model,
2460 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
2461 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
2462 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2463 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2464 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2465 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2466 }
2467
2468
2469 BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__avx512skx);
2470 BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__avx512skx);
2471 BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__avx512skx);
2472
2473 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__avx2);
2474 BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__avx2);
2475
2476 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__xop_ld64);
2477 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__xop_ld128);
2478 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__xop_ld64);
2479 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__xop_ld128);
2480 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__xop_ld64);
2481 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__xop_ld128);
2482
2483 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__xop_ld64);
2484 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__xop_ld128);
2485 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__xop_ld64);
2486 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__xop_ld128);
2487
2488 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__avx_ld64);
2489 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__avx_ld128);
2490 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__avx_ld64);
2491 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__avx_ld128);
2492 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__avx_ld64);
2493 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__avx_ld128);
2494
2495 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__avx_ld64);
2496 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__avx_ld128);
2497 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__avx_ld64);
2498 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__avx_ld128);
2499
2500 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse41_ld64);
2501 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse41_ld128);
2502 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse41_ld64);
2503 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse41_ld128);
2504 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse41_ld64);
2505 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse41_ld128);
2506
2507 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse41_ld64);
2508 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse41_ld128);
2509 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse41_ld64);
2510 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse41_ld128);
2511
2512 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__ssse3_ld64);
2513 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__ssse3_ld128);
2514 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__ssse3_ld64);
2515 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__ssse3_ld128);
2516
2517 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse2_ld64);
2518 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse2_ld128);
2519 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse2_ld64);
2520 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse2_ld128);
2521 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse2_ld64);
2522 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse2_ld128);
2523
2524 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse2_ld64);
2525 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse2_ld128);
2526 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse2_ld64);
2527 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse2_ld128);
2528 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2529
2530 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2531 static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2532 GEMMEnd2EndBenchmark(state, model,
2533 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
2534 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
2535 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2536 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2537 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2538 2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2539 }
qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2540 static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2541 GEMMEnd2EndBenchmark(state, model,
2542 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
2543 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
2544 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2545 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2546 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2547 2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2548 }
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2549 static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2550 GEMMEnd2EndBenchmark(state, model,
2551 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
2552 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
2553 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2554 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2555 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2556 3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2557 }
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2558 static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2559 GEMMEnd2EndBenchmark(state, model,
2560 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
2561 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
2562 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2563 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2564 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2565 3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2566 }
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2567 static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2568 GEMMEnd2EndBenchmark(state, model,
2569 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
2570 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
2571 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2572 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2573 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2574 4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2575 }
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2576 static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2577 GEMMEnd2EndBenchmark(state, model,
2578 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
2579 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
2580 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2581 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2582 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2583 4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2584 }
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2585 static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2586 GEMMEnd2EndBenchmark(state, model,
2587 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
2588 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
2589 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2590 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2591 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2592 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2593 }
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2594 static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2595 GEMMEnd2EndBenchmark(state, model,
2596 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
2597 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
2598 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2599 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2600 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2601 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2602 }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2603 static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2604 GEMMEnd2EndBenchmark(state, model,
2605 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
2606 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
2607 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2608 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2609 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2610 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2611 }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2612 static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2613 GEMMEnd2EndBenchmark(state, model,
2614 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
2615 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
2616 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2617 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2618 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2619 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2620 }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2621 static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2622 GEMMEnd2EndBenchmark(state, model,
2623 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
2624 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
2625 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2626 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2627 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2628 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2629 }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2630 static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2631 GEMMEnd2EndBenchmark(state, model,
2632 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
2633 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
2634 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2635 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2636 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2637 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2638 }
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2639 static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2640 GEMMEnd2EndBenchmark(state, model,
2641 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
2642 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
2643 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2644 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2645 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2646 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2647 }
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2648 static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2649 GEMMEnd2EndBenchmark(state, model,
2650 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
2651 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
2652 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2653 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2654 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2655 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2656 }
qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2657 static void qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2658 GEMMEnd2EndBenchmark(state, model,
2659 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
2660 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
2661 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2662 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2663 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2664 3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2665 }
qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2666 static void qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2667 GEMMEnd2EndBenchmark(state, model,
2668 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
2669 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
2670 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2671 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2672 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2673 3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2674 }
qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2675 static void qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2676 GEMMEnd2EndBenchmark(state, model,
2677 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
2678 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
2679 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2680 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2681 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2682 4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2683 }
qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2684 static void qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2685 GEMMEnd2EndBenchmark(state, model,
2686 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
2687 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
2688 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2689 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2690 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2691 4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2692 }
qs8_gemm_2x4c8__wasmsimd_mul16_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2693 static void qs8_gemm_2x4c8__wasmsimd_mul16_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2694 GEMMEnd2EndBenchmark(state, model,
2695 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64,
2696 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64,
2697 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64,
2698 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64,
2699 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2700 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2701 }
qs8_gemm_2x4c8__wasmsimd_mul16_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2702 static void qs8_gemm_2x4c8__wasmsimd_mul16_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2703 GEMMEnd2EndBenchmark(state, model,
2704 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128,
2705 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128,
2706 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128,
2707 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128,
2708 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2709 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2710 }
qs8_gemm_3x4c8__wasmsimd_mul16_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2711 static void qs8_gemm_3x4c8__wasmsimd_mul16_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2712 GEMMEnd2EndBenchmark(state, model,
2713 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64,
2714 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64,
2715 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64,
2716 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64,
2717 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2718 3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2719 }
qs8_gemm_3x4c8__wasmsimd_mul16_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2720 static void qs8_gemm_3x4c8__wasmsimd_mul16_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2721 GEMMEnd2EndBenchmark(state, model,
2722 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128,
2723 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128,
2724 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128,
2725 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128,
2726 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2727 3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2728 }
2729
2730 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128)2731 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
2732 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
2733 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
2734 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
2735 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128)
2736
2737 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
2738 BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
2739 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
2740 BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
2741 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
2742 BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)
2743
2744 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
2745 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
2746 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
2747 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
2748 BENCHMARK_QS8_END2END(qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
2749 BENCHMARK_QS8_END2END(qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128)
2750
2751 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__wasmsimd_mul16_ld64)
2752 BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__wasmsimd_mul16_ld128)
2753 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__wasmsimd_mul16_ld64)
2754 BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__wasmsimd_mul16_ld128)
2755 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2756
2757
2758 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2759 static void qs8_gemm_2x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2760 GEMMEnd2EndBenchmark(state, model,
2761 xnn_qs8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
2762 xnn_qs8_igemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
2763 xnn_qs8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2764 xnn_qs8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2765 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2766 2 /* mr */, 2 /* nr */);
2767 }
qs8_gemm_3x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2768 static void qs8_gemm_3x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2769 GEMMEnd2EndBenchmark(state, model,
2770 xnn_qs8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
2771 xnn_qs8_igemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
2772 xnn_qs8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2773 xnn_qs8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2774 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2775 3 /* mr */, 2 /* nr */);
2776 }
qs8_gemm_4x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2777 static void qs8_gemm_4x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2778 GEMMEnd2EndBenchmark(state, model,
2779 xnn_qs8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
2780 xnn_qs8_igemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
2781 xnn_qs8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2782 xnn_qs8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
2783 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2784 4 /* mr */, 2 /* nr */);
2785 }
qs8_gemm_2x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2786 static void qs8_gemm_2x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2787 GEMMEnd2EndBenchmark(state, model,
2788 xnn_qs8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
2789 xnn_qs8_igemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
2790 xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2791 xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2792 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2793 2 /* mr */, 4 /* nr */);
2794 }
qs8_gemm_3x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2795 static void qs8_gemm_3x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2796 GEMMEnd2EndBenchmark(state, model,
2797 xnn_qs8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
2798 xnn_qs8_igemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
2799 xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2800 xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2801 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2802 3 /* mr */, 4 /* nr */);
2803 }
qs8_gemm_4x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2804 static void qs8_gemm_4x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2805 GEMMEnd2EndBenchmark(state, model,
2806 xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
2807 xnn_qs8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
2808 xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2809 xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
2810 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2811 4 /* mr */, 4 /* nr */);
2812 }
2813
2814 BENCHMARK_QS8_END2END(qs8_gemm_2x2__wasm_fmagic)
BENCHMARK_QS8_END2END(qs8_gemm_3x2__wasm_fmagic)2815 BENCHMARK_QS8_END2END(qs8_gemm_3x2__wasm_fmagic)
2816 BENCHMARK_QS8_END2END(qs8_gemm_4x2__wasm_fmagic)
2817 BENCHMARK_QS8_END2END(qs8_gemm_2x4__wasm_fmagic)
2818 BENCHMARK_QS8_END2END(qs8_gemm_3x4__wasm_fmagic)
2819 BENCHMARK_QS8_END2END(qs8_gemm_4x4__wasm_fmagic)
2820 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2821
2822
2823 static void qs8_gemm_2x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2824 GEMMEnd2EndBenchmark(state, model,
2825 xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
2826 xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
2827 xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2828 xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2829 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2830 2 /* mr */, 2 /* nr */);
2831 }
qs8_gemm_3x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2832 static void qs8_gemm_3x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2833 GEMMEnd2EndBenchmark(state, model,
2834 xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
2835 xnn_qs8_igemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
2836 xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2837 xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2838 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2839 3 /* mr */, 2 /* nr */);
2840 }
qs8_gemm_4x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2841 static void qs8_gemm_4x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2842 GEMMEnd2EndBenchmark(state, model,
2843 xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
2844 xnn_qs8_igemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
2845 xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2846 xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
2847 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2848 4 /* mr */, 2 /* nr */);
2849 }
qs8_gemm_2x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2850 static void qs8_gemm_2x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2851 GEMMEnd2EndBenchmark(state, model,
2852 xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
2853 xnn_qs8_igemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
2854 xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2855 xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2856 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2857 2 /* mr */, 4 /* nr */);
2858 }
qs8_gemm_3x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2859 static void qs8_gemm_3x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2860 GEMMEnd2EndBenchmark(state, model,
2861 xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
2862 xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
2863 xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2864 xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2865 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2866 3 /* mr */, 4 /* nr */);
2867 }
qs8_gemm_4x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)2868 static void qs8_gemm_4x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2869 GEMMEnd2EndBenchmark(state, model,
2870 xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
2871 xnn_qs8_igemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
2872 xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2873 xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
2874 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
2875 4 /* mr */, 4 /* nr */);
2876 }
2877
qs8_gemm_2x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2878 static void qs8_gemm_2x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2879 GEMMEnd2EndBenchmark(state, model,
2880 xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
2881 xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic,
2882 xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2883 xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2884 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
2885 2 /* mr */, 2 /* nr */);
2886 }
qs8_gemm_3x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2887 static void qs8_gemm_3x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2888 GEMMEnd2EndBenchmark(state, model,
2889 xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
2890 xnn_qs8_igemm_minmax_fp32_ukernel_3x2__scalar_imagic,
2891 xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2892 xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2893 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
2894 3 /* mr */, 2 /* nr */);
2895 }
qs8_gemm_4x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2896 static void qs8_gemm_4x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2897 GEMMEnd2EndBenchmark(state, model,
2898 xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
2899 xnn_qs8_igemm_minmax_fp32_ukernel_4x2__scalar_imagic,
2900 xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2901 xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
2902 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
2903 4 /* mr */, 2 /* nr */);
2904 }
qs8_gemm_2x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2905 static void qs8_gemm_2x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2906 GEMMEnd2EndBenchmark(state, model,
2907 xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
2908 xnn_qs8_igemm_minmax_fp32_ukernel_2x4__scalar_imagic,
2909 xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
2910 xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
2911 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
2912 2 /* mr */, 4 /* nr */);
2913 }
qs8_gemm_3x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2914 static void qs8_gemm_3x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
2915 GEMMEnd2EndBenchmark(state, model,
2916 xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
2917 xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_imagic,
2918 xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
2919 xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
2920 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
2921 3 /* mr */, 4 /* nr */);
2922 }
qs8_gemm_4x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)2923