1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <xnnpack.h>
13
14 #include <benchmark/benchmark.h>
15
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 #include <xnnpack/gemm.h>
20 #include <xnnpack/igemm.h>
21 #include <xnnpack/params.h>
22 #include <xnnpack/params-init.h>
23
24
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_qu8_gemm_minmax_ukernel_function gemm,xnn_qu8_igemm_minmax_ukernel_function igemm,xnn_qu8_gemm_minmax_ukernel_function gemm1,xnn_qu8_igemm_minmax_ukernel_function igemm1,xnn_init_qu8_conv_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)25 static void GEMMEnd2EndBenchmark(
26 benchmark::State& state,
27 models::ExecutionPlanFactory model_factory,
28 xnn_qu8_gemm_minmax_ukernel_function gemm,
29 xnn_qu8_igemm_minmax_ukernel_function igemm,
30 xnn_qu8_gemm_minmax_ukernel_function gemm1,
31 xnn_qu8_igemm_minmax_ukernel_function igemm1,
32 xnn_init_qu8_conv_minmax_params_fn init_params,
33 uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
34 benchmark::utils::IsaCheckFunction isa_check = nullptr)
35 {
36 if (isa_check && !isa_check(state)) {
37 return;
38 }
39 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
40 state.SkipWithError("failed to initialize XNNPACK");
41 return;
42 }
43
44 // Override microkernels chosen in xnn_initialize
45 // Note: do not directly assign to xnn_params.qu8.gemm because it breaks older gcc.
46 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
47 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
48 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
49 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
50 xnn_params.qu8.gemm.init.qu8 = init_params;
51 xnn_params.qu8.gemm.mr = mr;
52 xnn_params.qu8.gemm.nr = nr;
53 xnn_params.qu8.gemm.log2_kr = log2_kr;
54 xnn_params.qu8.gemm.log2_sr = log2_sr;
55
56 auto execution_plan = model_factory(nullptr);
57 if (execution_plan.empty()) {
58 state.SkipWithError("failed to create a model");
59 return;
60 }
61
62 for (auto _ : state) {
63 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
64 xnn_status status = xnn_run_operator(op.get(), nullptr);
65 if (status != xnn_status_success) {
66 state.SkipWithError("failed to run a model");
67 return;
68 }
69 }
70 }
71
72 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
73 if (cpu_frequency != 0) {
74 state.counters["cpufreq"] = cpu_frequency;
75 }
76 }
77
78 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)79 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
80 GEMMEnd2EndBenchmark(state, model,
81 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
82 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
83 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
84 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
85 xnn_init_qu8_conv_minmax_rndnu_neon_params,
86 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
87 benchmark::utils::CheckNEON);
88 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)89 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
90 GEMMEnd2EndBenchmark(state, model,
91 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
92 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
93 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
94 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
95 xnn_init_qu8_conv_minmax_rndnu_neon_params,
96 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
97 benchmark::utils::CheckNEON);
98 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)99 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
100 GEMMEnd2EndBenchmark(state, model,
101 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
102 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
103 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
104 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
105 xnn_init_qu8_conv_minmax_rndnu_neon_params,
106 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
107 benchmark::utils::CheckNEON);
108 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)109 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
110 GEMMEnd2EndBenchmark(state, model,
111 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
112 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
113 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
114 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
115 xnn_init_qu8_conv_minmax_rndnu_neon_params,
116 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
117 benchmark::utils::CheckNEON);
118 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)119 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
120 GEMMEnd2EndBenchmark(state, model,
121 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
122 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
123 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
124 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
125 xnn_init_qu8_conv_minmax_rndnu_neon_params,
126 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
127 benchmark::utils::CheckNEON);
128 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)129 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
130 GEMMEnd2EndBenchmark(state, model,
131 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
132 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
133 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
134 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
135 xnn_init_qu8_conv_minmax_rndnu_neon_params,
136 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
137 benchmark::utils::CheckNEON);
138 }
139 BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)140 BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
141 BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
142 BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
143 BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
144 BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
145 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
146
147 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
148 static void qu8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
149 GEMMEnd2EndBenchmark(state, model,
150 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
151 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
152 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
153 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
154 xnn_init_qu8_conv_minmax_rndnu_neon_params,
155 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
156 benchmark::utils::CheckNEONDOT);
157 }
qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)158 static void qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
159 GEMMEnd2EndBenchmark(state, model,
160 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
161 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
162 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
163 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
164 xnn_init_qu8_conv_minmax_rndnu_neon_params,
165 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
166 benchmark::utils::CheckNEONDOT);
167 }
qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)168 static void qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
169 GEMMEnd2EndBenchmark(state, model,
170 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
171 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
172 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
173 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
174 xnn_init_qu8_conv_minmax_rndnu_neon_params,
175 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
176 benchmark::utils::CheckNEONDOT);
177 }
qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)178 static void qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
179 GEMMEnd2EndBenchmark(state, model,
180 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
181 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
182 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
183 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
184 xnn_init_qu8_conv_minmax_rndnu_neon_params,
185 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
186 benchmark::utils::CheckNEONDOT);
187 }
188
qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)189 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
190 GEMMEnd2EndBenchmark(state, model,
191 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
192 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
193 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
194 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
195 xnn_init_qu8_conv_minmax_rndnu_neon_params,
196 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
197 benchmark::utils::CheckNEON);
198 }
199
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)200 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
201 GEMMEnd2EndBenchmark(state, model,
202 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
203 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
204 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
205 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
206 xnn_init_qu8_conv_minmax_rndnu_neon_params,
207 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
208 benchmark::utils::CheckNEON);
209 }
210
qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)211 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
212 GEMMEnd2EndBenchmark(state, model,
213 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
214 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
215 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
216 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
217 xnn_init_qu8_conv_minmax_rndnu_neon_params,
218 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
219 benchmark::utils::CheckNEON);
220 }
221
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)222 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
223 GEMMEnd2EndBenchmark(state, model,
224 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
225 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
226 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
227 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
228 xnn_init_qu8_conv_minmax_rndnu_neon_params,
229 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
230 benchmark::utils::CheckNEON);
231 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)232 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
233 GEMMEnd2EndBenchmark(state, model,
234 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
235 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
236 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
237 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
238 xnn_init_qu8_conv_minmax_rndnu_neon_params,
239 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
240 benchmark::utils::CheckNEON);
241 }
242
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)243 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
244 GEMMEnd2EndBenchmark(state, model,
245 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
246 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
247 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
248 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
249 xnn_init_qu8_conv_minmax_rndnu_neon_params,
250 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
251 benchmark::utils::CheckNEON);
252 }
253 BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__aarch64_neondot_cortex_a55);
254 BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__aarch64_neondot_cortex_a55);
255 BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__aarch64_neondot_ld128);
256 BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__aarch64_neondot_ld128);
257 BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75);
258 BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
259 BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53);
260 BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
261 BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64);
262 BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64);
263 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
264
265 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
qu8_gemm_2x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)266 static void qu8_gemm_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
267 GEMMEnd2EndBenchmark(state, model,
268 xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
269 xnn_qu8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
270 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
271 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
272 xnn_init_qu8_conv_minmax_rndnu_neon_params,
273 2 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
274 benchmark::utils::CheckNEON);
275 }
276
qu8_gemm_3x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)277 static void qu8_gemm_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
278 GEMMEnd2EndBenchmark(state, model,
279 xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
280 xnn_qu8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
281 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
282 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
283 xnn_init_qu8_conv_minmax_rndnu_neon_params,
284 3 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
285 benchmark::utils::CheckNEON);
286 }
287
qu8_gemm_4x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)288 static void qu8_gemm_4x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
289 GEMMEnd2EndBenchmark(state, model,
290 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
291 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
292 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
293 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
294 xnn_init_qu8_conv_minmax_rndnu_neon_params,
295 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
296 benchmark::utils::CheckNEON);
297 }
298
qu8_gemm_6x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)299 static void qu8_gemm_6x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
300 GEMMEnd2EndBenchmark(state, model,
301 xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
302 xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
303 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
304 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
305 xnn_init_qu8_conv_minmax_rndnu_neon_params,
306 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
307 benchmark::utils::CheckNEON);
308 }
309
qu8_gemm_2x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)310 static void qu8_gemm_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
311 GEMMEnd2EndBenchmark(state, model,
312 xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
313 xnn_qu8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
314 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
315 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
316 xnn_init_qu8_conv_minmax_rndnu_neon_params,
317 2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
318 benchmark::utils::CheckNEON);
319 }
320
qu8_gemm_3x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)321 static void qu8_gemm_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
322 GEMMEnd2EndBenchmark(state, model,
323 xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
324 xnn_qu8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
325 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
326 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
327 xnn_init_qu8_conv_minmax_rndnu_neon_params,
328 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
329 benchmark::utils::CheckNEON);
330 }
331
qu8_gemm_4x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)332 static void qu8_gemm_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
333 GEMMEnd2EndBenchmark(state, model,
334 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
335 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
336 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
337 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
338 xnn_init_qu8_conv_minmax_rndnu_neon_params,
339 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
340 benchmark::utils::CheckNEON);
341 }
342
qu8_gemm_6x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)343 static void qu8_gemm_6x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
344 GEMMEnd2EndBenchmark(state, model,
345 xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
346 xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
347 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
348 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
349 xnn_init_qu8_conv_minmax_rndnu_neon_params,
350 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
351 benchmark::utils::CheckNEON);
352 }
353
qu8_gemm_1x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)354 static void qu8_gemm_1x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
355 GEMMEnd2EndBenchmark(state, model,
356 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
357 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
358 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
359 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
360 xnn_init_qu8_conv_minmax_rndnu_neon_params,
361 1 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
362 benchmark::utils::CheckNEONDOT);
363 }
qu8_gemm_2x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)364 static void qu8_gemm_2x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
365 GEMMEnd2EndBenchmark(state, model,
366 xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot,
367 xnn_qu8_igemm_minmax_rndnu_ukernel_2x8c4__neondot,
368 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
369 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
370 xnn_init_qu8_conv_minmax_rndnu_neon_params,
371 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
372 benchmark::utils::CheckNEONDOT);
373 }
qu8_gemm_3x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)374 static void qu8_gemm_3x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
375 GEMMEnd2EndBenchmark(state, model,
376 xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot,
377 xnn_qu8_igemm_minmax_rndnu_ukernel_3x8c4__neondot,
378 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
379 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
380 xnn_init_qu8_conv_minmax_rndnu_neon_params,
381 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
382 benchmark::utils::CheckNEONDOT);
383 }
qu8_gemm_4x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)384 static void qu8_gemm_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
385 GEMMEnd2EndBenchmark(state, model,
386 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
387 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot,
388 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
389 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
390 xnn_init_qu8_conv_minmax_rndnu_neon_params,
391 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
392 benchmark::utils::CheckNEONDOT);
393 }
qu8_gemm_5x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)394 static void qu8_gemm_5x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
395 GEMMEnd2EndBenchmark(state, model,
396 xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot,
397 xnn_qu8_igemm_minmax_rndnu_ukernel_5x8c4__neondot,
398 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
399 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
400 xnn_init_qu8_conv_minmax_rndnu_neon_params,
401 5 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
402 benchmark::utils::CheckNEONDOT);
403 }
qu8_gemm_6x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)404 static void qu8_gemm_6x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
405 GEMMEnd2EndBenchmark(state, model,
406 xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
407 xnn_qu8_igemm_minmax_rndnu_ukernel_6x8c4__neondot,
408 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
409 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
410 xnn_init_qu8_conv_minmax_rndnu_neon_params,
411 6 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
412 benchmark::utils::CheckNEONDOT);
413 }
qu8_gemm_8x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)414 static void qu8_gemm_8x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
415 GEMMEnd2EndBenchmark(state, model,
416 xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
417 xnn_qu8_igemm_minmax_rndnu_ukernel_8x8c4__neondot,
418 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
419 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
420 xnn_init_qu8_conv_minmax_rndnu_neon_params,
421 8 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
422 benchmark::utils::CheckNEONDOT);
423 }
qu8_gemm_1x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)424 static void qu8_gemm_1x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
425 GEMMEnd2EndBenchmark(state, model,
426 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
427 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
428 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
429 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
430 xnn_init_qu8_conv_minmax_rndnu_neon_params,
431 1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
432 benchmark::utils::CheckNEONDOT);
433 }
qu8_gemm_2x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)434 static void qu8_gemm_2x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
435 GEMMEnd2EndBenchmark(state, model,
436 xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot,
437 xnn_qu8_igemm_minmax_rndnu_ukernel_2x16c4__neondot,
438 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
439 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
440 xnn_init_qu8_conv_minmax_rndnu_neon_params,
441 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
442 benchmark::utils::CheckNEONDOT);
443 }
qu8_gemm_3x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)444 static void qu8_gemm_3x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
445 GEMMEnd2EndBenchmark(state, model,
446 xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot,
447 xnn_qu8_igemm_minmax_rndnu_ukernel_3x16c4__neondot,
448 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
449 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
450 xnn_init_qu8_conv_minmax_rndnu_neon_params,
451 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
452 benchmark::utils::CheckNEONDOT);
453 }
qu8_gemm_4x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)454 static void qu8_gemm_4x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
455 GEMMEnd2EndBenchmark(state, model,
456 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
457 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
458 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
459 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
460 xnn_init_qu8_conv_minmax_rndnu_neon_params,
461 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
462 benchmark::utils::CheckNEONDOT);
463 }
qu8_gemm_5x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)464 static void qu8_gemm_5x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
465 GEMMEnd2EndBenchmark(state, model,
466 xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot,
467 xnn_qu8_igemm_minmax_rndnu_ukernel_5x16c4__neondot,
468 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
469 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
470 xnn_init_qu8_conv_minmax_rndnu_neon_params,
471 5 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
472 benchmark::utils::CheckNEONDOT);
473 }
qu8_gemm_6x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)474 static void qu8_gemm_6x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
475 GEMMEnd2EndBenchmark(state, model,
476 xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
477 xnn_qu8_igemm_minmax_rndnu_ukernel_6x16c4__neondot,
478 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
479 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
480 xnn_init_qu8_conv_minmax_rndnu_neon_params,
481 6 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
482 benchmark::utils::CheckNEONDOT);
483 }
qu8_gemm_8x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)484 static void qu8_gemm_8x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
485 GEMMEnd2EndBenchmark(state, model,
486 xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
487 xnn_qu8_igemm_minmax_rndnu_ukernel_8x16c4__neondot,
488 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
489 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
490 xnn_init_qu8_conv_minmax_rndnu_neon_params,
491 8 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
492 benchmark::utils::CheckNEONDOT);
493 }
qu8_gemm_2x32c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)494 static void qu8_gemm_2x32c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
495 GEMMEnd2EndBenchmark(state, model,
496 xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot,
497 xnn_qu8_igemm_minmax_rndnu_ukernel_2x32c4__neondot,
498 xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
499 xnn_qu8_igemm_minmax_rndnu_ukernel_1x32c4__neondot,
500 xnn_init_qu8_conv_minmax_rndnu_neon_params,
501 2 /* mr */, 32 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
502 benchmark::utils::CheckNEONDOT);
503 }
qu8_gemm_3x32c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)504 static void qu8_gemm_3x32c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
505 GEMMEnd2EndBenchmark(state, model,
506 xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
507 xnn_qu8_igemm_minmax_rndnu_ukernel_3x32c4__neondot,
508 xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
509 xnn_qu8_igemm_minmax_rndnu_ukernel_1x32c4__neondot,
510 xnn_init_qu8_conv_minmax_rndnu_neon_params,
511 3 /* mr */, 32 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
512 benchmark::utils::CheckNEONDOT);
513 }
514 BENCHMARK_QU8_END2END(qu8_gemm_1x8c4__neondot);
515 BENCHMARK_QU8_END2END(qu8_gemm_2x8c4__neondot);
516 BENCHMARK_QU8_END2END(qu8_gemm_3x8c4__neondot);
517 BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__neondot);
518 BENCHMARK_QU8_END2END(qu8_gemm_5x8c4__neondot);
519 BENCHMARK_QU8_END2END(qu8_gemm_6x8c4__neondot);
520 BENCHMARK_QU8_END2END(qu8_gemm_8x8c4__neondot);
521 BENCHMARK_QU8_END2END(qu8_gemm_1x16c4__neondot);
522 BENCHMARK_QU8_END2END(qu8_gemm_2x16c4__neondot);
523 BENCHMARK_QU8_END2END(qu8_gemm_3x16c4__neondot);
524 BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__neondot);
525 BENCHMARK_QU8_END2END(qu8_gemm_5x16c4__neondot);
526 BENCHMARK_QU8_END2END(qu8_gemm_6x16c4__neondot);
527 BENCHMARK_QU8_END2END(qu8_gemm_8x16c4__neondot);
528 BENCHMARK_QU8_END2END(qu8_gemm_2x32c4__neondot);
529 BENCHMARK_QU8_END2END(qu8_gemm_3x32c4__neondot);
530
531 BENCHMARK_QU8_END2END(qu8_gemm_2x8__neon_mlal_lane);
532 BENCHMARK_QU8_END2END(qu8_gemm_3x8__neon_mlal_lane);
533 BENCHMARK_QU8_END2END(qu8_gemm_4x8__neon_mlal_lane);
534 BENCHMARK_QU8_END2END(qu8_gemm_6x8__neon_mlal_lane);
535 BENCHMARK_QU8_END2END(qu8_gemm_2x16__neon_mlal_lane);
536 BENCHMARK_QU8_END2END(qu8_gemm_3x16__neon_mlal_lane);
537 BENCHMARK_QU8_END2END(qu8_gemm_4x16__neon_mlal_lane);
538 BENCHMARK_QU8_END2END(qu8_gemm_6x16__neon_mlal_lane);
539 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
540
541 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qu8_gemm_2x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)542 static void qu8_gemm_2x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
543 GEMMEnd2EndBenchmark(state, model,
544 xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
545 xnn_qu8_igemm_minmax_fp32_ukernel_2x16c8__avx512skx,
546 xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
547 xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
548 xnn_init_qu8_conv_minmax_fp32_avx512_params,
549 2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
550 benchmark::utils::CheckAVX512F);
551 }
552
qu8_gemm_3x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)553 static void qu8_gemm_3x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
554 GEMMEnd2EndBenchmark(state, model,
555 xnn_qu8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
556 xnn_qu8_igemm_minmax_fp32_ukernel_3x16c8__avx512skx,
557 xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
558 xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
559 xnn_init_qu8_conv_minmax_fp32_avx512_params,
560 3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
561 benchmark::utils::CheckAVX512F);
562 }
563
qu8_gemm_4x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)564 static void qu8_gemm_4x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
565 GEMMEnd2EndBenchmark(state, model,
566 xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
567 xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx,
568 xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
569 xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
570 xnn_init_qu8_conv_minmax_fp32_avx512_params,
571 4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
572 benchmark::utils::CheckAVX512F);
573 }
574
qu8_gemm_2x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)575 static void qu8_gemm_2x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
576 GEMMEnd2EndBenchmark(state, model,
577 xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
578 xnn_qu8_igemm_minmax_fp32_ukernel_2x8c8__avx2,
579 xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
580 xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
581 xnn_init_qu8_conv_minmax_fp32_avx2_params,
582 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
583 benchmark::utils::CheckAVX2);
584 }
qu8_gemm_3x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)585 static void qu8_gemm_3x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
586 GEMMEnd2EndBenchmark(state, model,
587 xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
588 xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2,
589 xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
590 xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
591 xnn_init_qu8_conv_minmax_fp32_avx2_params,
592 3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
593 benchmark::utils::CheckAVX2);
594 }
595
qu8_gemm_2x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)596 static void qu8_gemm_2x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
597 GEMMEnd2EndBenchmark(state, model,
598 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
599 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
600 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
601 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
602 xnn_init_qu8_conv_minmax_fp32_sse2_params,
603 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
604 benchmark::utils::CheckXOP);
605 }
qu8_gemm_2x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)606 static void qu8_gemm_2x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
607 GEMMEnd2EndBenchmark(state, model,
608 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
609 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
610 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
611 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
612 xnn_init_qu8_conv_minmax_fp32_sse2_params,
613 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
614 benchmark::utils::CheckXOP);
615 }
qu8_gemm_3x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)616 static void qu8_gemm_3x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
617 GEMMEnd2EndBenchmark(state, model,
618 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
619 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
620 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
621 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
622 xnn_init_qu8_conv_minmax_fp32_sse2_params,
623 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
624 benchmark::utils::CheckXOP);
625 }
qu8_gemm_3x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)626 static void qu8_gemm_3x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
627 GEMMEnd2EndBenchmark(state, model,
628 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
629 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
630 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
631 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
632 xnn_init_qu8_conv_minmax_fp32_sse2_params,
633 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
634 benchmark::utils::CheckXOP);
635 }
qu8_gemm_4x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)636 static void qu8_gemm_4x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
637 GEMMEnd2EndBenchmark(state, model,
638 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
639 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
640 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
641 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
642 xnn_init_qu8_conv_minmax_fp32_sse2_params,
643 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
644 benchmark::utils::CheckXOP);
645 }
qu8_gemm_4x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)646 static void qu8_gemm_4x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
647 GEMMEnd2EndBenchmark(state, model,
648 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
649 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
650 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
651 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
652 xnn_init_qu8_conv_minmax_fp32_sse2_params,
653 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
654 benchmark::utils::CheckXOP);
655 }
656
qu8_gemm_2x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)657 static void qu8_gemm_2x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
658 GEMMEnd2EndBenchmark(state, model,
659 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
660 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
661 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
662 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
663 xnn_init_qu8_conv_minmax_fp32_sse2_params,
664 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
665 benchmark::utils::CheckXOP);
666 }
qu8_gemm_3x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)667 static void qu8_gemm_3x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
668 GEMMEnd2EndBenchmark(state, model,
669 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
670 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
671 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
672 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
673 xnn_init_qu8_conv_minmax_fp32_sse2_params,
674 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
675 benchmark::utils::CheckXOP);
676 }
677
qu8_gemm_2x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)678 static void qu8_gemm_2x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
679 GEMMEnd2EndBenchmark(state, model,
680 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
681 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
682 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
683 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
684 xnn_init_qu8_conv_minmax_fp32_sse2_params,
685 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
686 benchmark::utils::CheckXOP);
687 }
qu8_gemm_3x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)688 static void qu8_gemm_3x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
689 GEMMEnd2EndBenchmark(state, model,
690 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
691 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
692 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
693 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
694 xnn_init_qu8_conv_minmax_fp32_sse2_params,
695 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
696 benchmark::utils::CheckXOP);
697 }
698
qu8_gemm_2x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)699 static void qu8_gemm_2x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
700 GEMMEnd2EndBenchmark(state, model,
701 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
702 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
703 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
704 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
705 xnn_init_qu8_conv_minmax_fp32_sse2_params,
706 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
707 benchmark::utils::CheckAVX);
708 }
qu8_gemm_2x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)709 static void qu8_gemm_2x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
710 GEMMEnd2EndBenchmark(state, model,
711 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
712 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
713 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
714 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
715 xnn_init_qu8_conv_minmax_fp32_sse2_params,
716 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
717 benchmark::utils::CheckAVX);
718 }
qu8_gemm_3x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)719 static void qu8_gemm_3x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
720 GEMMEnd2EndBenchmark(state, model,
721 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
722 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
723 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
724 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
725 xnn_init_qu8_conv_minmax_fp32_sse2_params,
726 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
727 benchmark::utils::CheckAVX);
728 }
qu8_gemm_3x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)729 static void qu8_gemm_3x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
730 GEMMEnd2EndBenchmark(state, model,
731 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
732 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
733 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
734 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
735 xnn_init_qu8_conv_minmax_fp32_sse2_params,
736 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
737 benchmark::utils::CheckAVX);
738 }
qu8_gemm_4x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)739 static void qu8_gemm_4x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
740 GEMMEnd2EndBenchmark(state, model,
741 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
742 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
743 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
744 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
745 xnn_init_qu8_conv_minmax_fp32_sse2_params,
746 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
747 benchmark::utils::CheckAVX);
748 }
qu8_gemm_4x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)749 static void qu8_gemm_4x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
750 GEMMEnd2EndBenchmark(state, model,
751 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
752 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
753 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
754 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
755 xnn_init_qu8_conv_minmax_fp32_sse2_params,
756 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
757 benchmark::utils::CheckAVX);
758 }
759
760
qu8_gemm_2x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)761 static void qu8_gemm_2x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
762 GEMMEnd2EndBenchmark(state, model,
763 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
764 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
765 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
766 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
767 xnn_init_qu8_conv_minmax_fp32_sse2_params,
768 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
769 benchmark::utils::CheckAVX);
770 }
qu8_gemm_2x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)771 static void qu8_gemm_2x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
772 GEMMEnd2EndBenchmark(state, model,
773 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
774 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
775 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
776 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
777 xnn_init_qu8_conv_minmax_fp32_sse2_params,
778 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
779 benchmark::utils::CheckAVX);
780 }
qu8_gemm_3x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)781 static void qu8_gemm_3x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
782 GEMMEnd2EndBenchmark(state, model,
783 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
784 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
785 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
786 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
787 xnn_init_qu8_conv_minmax_fp32_sse2_params,
788 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
789 benchmark::utils::CheckAVX);
790 }
qu8_gemm_3x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)791 static void qu8_gemm_3x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
792 GEMMEnd2EndBenchmark(state, model,
793 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
794 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
795 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
796 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
797 xnn_init_qu8_conv_minmax_fp32_sse2_params,
798 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
799 benchmark::utils::CheckAVX);
800 }
801
qu8_gemm_2x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)802 static void qu8_gemm_2x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
803 GEMMEnd2EndBenchmark(state, model,
804 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
805 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
806 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
807 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
808 xnn_init_qu8_conv_minmax_fp32_sse2_params,
809 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
810 benchmark::utils::CheckSSE41);
811 }
qu8_gemm_2x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)812 static void qu8_gemm_2x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
813 GEMMEnd2EndBenchmark(state, model,
814 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
815 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
816 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
817 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
818 xnn_init_qu8_conv_minmax_fp32_sse2_params,
819 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
820 benchmark::utils::CheckSSE41);
821 }
qu8_gemm_3x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)822 static void qu8_gemm_3x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
823 GEMMEnd2EndBenchmark(state, model,
824 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
825 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
826 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
827 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
828 xnn_init_qu8_conv_minmax_fp32_sse2_params,
829 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
830 benchmark::utils::CheckSSE41);
831 }
qu8_gemm_3x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)832 static void qu8_gemm_3x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
833 GEMMEnd2EndBenchmark(state, model,
834 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
835 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
836 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
837 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
838 xnn_init_qu8_conv_minmax_fp32_sse2_params,
839 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
840 benchmark::utils::CheckSSE41);
841 }
qu8_gemm_4x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)842 static void qu8_gemm_4x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
843 GEMMEnd2EndBenchmark(state, model,
844 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
845 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
846 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
847 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
848 xnn_init_qu8_conv_minmax_fp32_sse2_params,
849 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
850 benchmark::utils::CheckSSE41);
851 }
qu8_gemm_4x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)852 static void qu8_gemm_4x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
853 GEMMEnd2EndBenchmark(state, model,
854 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
855 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
856 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
857 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
858 xnn_init_qu8_conv_minmax_fp32_sse2_params,
859 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
860 benchmark::utils::CheckSSE41);
861 }
862
qu8_gemm_2x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)863 static void qu8_gemm_2x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
864 GEMMEnd2EndBenchmark(state, model,
865 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
866 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
867 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
868 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
869 xnn_init_qu8_conv_minmax_fp32_sse2_params,
870 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
871 benchmark::utils::CheckSSE41);
872 }
qu8_gemm_2x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)873 static void qu8_gemm_2x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
874 GEMMEnd2EndBenchmark(state, model,
875 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
876 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
877 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
878 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
879 xnn_init_qu8_conv_minmax_fp32_sse2_params,
880 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
881 benchmark::utils::CheckSSE41);
882 }
qu8_gemm_3x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)883 static void qu8_gemm_3x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
884 GEMMEnd2EndBenchmark(state, model,
885 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
886 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
887 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
888 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
889 xnn_init_qu8_conv_minmax_fp32_sse2_params,
890 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
891 benchmark::utils::CheckSSE41);
892 }
qu8_gemm_3x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)893 static void qu8_gemm_3x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
894 GEMMEnd2EndBenchmark(state, model,
895 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
896 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
897 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
898 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
899 xnn_init_qu8_conv_minmax_fp32_sse2_params,
900 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
901 benchmark::utils::CheckSSE41);
902 }
903
qu8_gemm_2x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)904 static void qu8_gemm_2x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
905 GEMMEnd2EndBenchmark(state, model,
906 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
907 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
908 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
909 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
910 xnn_init_qu8_conv_minmax_fp32_sse2_params,
911 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
912 }
qu8_gemm_2x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)913 static void qu8_gemm_2x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
914 GEMMEnd2EndBenchmark(state, model,
915 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
916 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
917 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
918 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
919 xnn_init_qu8_conv_minmax_fp32_sse2_params,
920 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
921 }
qu8_gemm_3x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)922 static void qu8_gemm_3x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
923 GEMMEnd2EndBenchmark(state, model,
924 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
925 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
926 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
927 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
928 xnn_init_qu8_conv_minmax_fp32_sse2_params,
929 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
930 }
qu8_gemm_3x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)931 static void qu8_gemm_3x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
932 GEMMEnd2EndBenchmark(state, model,
933 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
934 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
935 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
936 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
937 xnn_init_qu8_conv_minmax_fp32_sse2_params,
938 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
939 }
qu8_gemm_4x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)940 static void qu8_gemm_4x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
941 GEMMEnd2EndBenchmark(state, model,
942 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
943 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
944 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
945 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
946 xnn_init_qu8_conv_minmax_fp32_sse2_params,
947 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
948 }
qu8_gemm_4x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)949 static void qu8_gemm_4x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
950 GEMMEnd2EndBenchmark(state, model,
951 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
952 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
953 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
954 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
955 xnn_init_qu8_conv_minmax_fp32_sse2_params,
956 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
957 }
958
qu8_gemm_2x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)959 static void qu8_gemm_2x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
960 GEMMEnd2EndBenchmark(state, model,
961 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
962 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
963 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
964 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
965 xnn_init_qu8_conv_minmax_fp32_sse2_params,
966 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
967 }
qu8_gemm_2x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)968 static void qu8_gemm_2x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
969 GEMMEnd2EndBenchmark(state, model,
970 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
971 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
972 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
973 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
974 xnn_init_qu8_conv_minmax_fp32_sse2_params,
975 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
976 }
qu8_gemm_3x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)977 static void qu8_gemm_3x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
978 GEMMEnd2EndBenchmark(state, model,
979 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
980 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
981 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
982 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
983 xnn_init_qu8_conv_minmax_fp32_sse2_params,
984 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
985 }
qu8_gemm_3x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)986 static void qu8_gemm_3x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
987 GEMMEnd2EndBenchmark(state, model,
988 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
989 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
990 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
991 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
992 xnn_init_qu8_conv_minmax_fp32_sse2_params,
993 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
994 }
995
996
997 BENCHMARK_QU8_END2END(qu8_gemm_2x16c8__avx512skx);
998 BENCHMARK_QU8_END2END(qu8_gemm_3x16c8__avx512skx);
999 BENCHMARK_QU8_END2END(qu8_gemm_4x16c8__avx512skx);
1000
1001 BENCHMARK_QU8_END2END(qu8_gemm_2x8c8__avx2);
1002 BENCHMARK_QU8_END2END(qu8_gemm_3x8c8__avx2);
1003
1004 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__xop_ld64);
1005 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__xop_ld128);
1006 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__xop_ld64);
1007 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__xop_ld128);
1008 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__xop_ld64);
1009 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__xop_ld128);
1010
1011 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__xop_ld64);
1012 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__xop_ld128);
1013 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__xop_ld64);
1014 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__xop_ld128);
1015
1016 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__avx_ld64);
1017 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__avx_ld128);
1018 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__avx_ld64);
1019 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__avx_ld128);
1020 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__avx_ld64);
1021 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__avx_ld128);
1022
1023 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__avx_ld64);
1024 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__avx_ld128);
1025 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__avx_ld64);
1026 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__avx_ld128);
1027
1028 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse41_ld64);
1029 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse41_ld128);
1030 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse41_ld64);
1031 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse41_ld128);
1032 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse41_ld64);
1033 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse41_ld128);
1034
1035 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse41_ld64);
1036 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse41_ld128);
1037 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse41_ld64);
1038 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse41_ld128);
1039
1040 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse2_ld64);
1041 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse2_ld128);
1042 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse2_ld64);
1043 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse2_ld128);
1044 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse2_ld64);
1045 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse2_ld128);
1046
1047 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse2_ld64);
1048 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse2_ld128);
1049 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse2_ld64);
1050 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse2_ld128);
1051 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1052
1053 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1054 static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1055 GEMMEnd2EndBenchmark(state, model,
1056 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
1057 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
1058 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1059 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1060 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1061 2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1062 }
qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1063 static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1064 GEMMEnd2EndBenchmark(state, model,
1065 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
1066 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
1067 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1068 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1069 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1070 2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1071 }
qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1072 static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1073 GEMMEnd2EndBenchmark(state, model,
1074 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
1075 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
1076 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1077 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1078 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1079 3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1080 }
qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1081 static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1082 GEMMEnd2EndBenchmark(state, model,
1083 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
1084 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
1085 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1086 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1087 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1088 3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1089 }
qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1090 static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1091 GEMMEnd2EndBenchmark(state, model,
1092 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
1093 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
1094 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1095 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1096 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1097 4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1098 }
qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1099 static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1100 GEMMEnd2EndBenchmark(state, model,
1101 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
1102 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
1103 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1104 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1105 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1106 4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1107 }
1108
qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1109 static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1110 GEMMEnd2EndBenchmark(state, model,
1111 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
1112 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
1113 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1114 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1115 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1116 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1117 }
qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1118 static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1119 GEMMEnd2EndBenchmark(state, model,
1120 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
1121 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
1122 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1123 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1124 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1125 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1126 }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1127 static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1128 GEMMEnd2EndBenchmark(state, model,
1129 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
1130 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
1131 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1132 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1133 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1134 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1135 }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1136 static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1137 GEMMEnd2EndBenchmark(state, model,
1138 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
1139 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
1140 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1141 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1142 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1143 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1144 }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1145 static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1146 GEMMEnd2EndBenchmark(state, model,
1147 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
1148 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
1149 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1150 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1151 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1152 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1153 }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1154 static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1155 GEMMEnd2EndBenchmark(state, model,
1156 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
1157 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
1158 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1159 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1160 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1161 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1162 }
1163
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1164 static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1165 GEMMEnd2EndBenchmark(state, model,
1166 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
1167 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
1168 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1169 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1170 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1171 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1172 }
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1173 static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1174 GEMMEnd2EndBenchmark(state, model,
1175 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
1176 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
1177 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1178 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1179 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1180 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1181 }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1182 static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1183 GEMMEnd2EndBenchmark(state, model,
1184 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
1185 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
1186 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1187 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1188 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1189 3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1190 }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1191 static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1192 GEMMEnd2EndBenchmark(state, model,
1193 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
1194 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
1195 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1196 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1197 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1198 3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1199 }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1200 static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1201 GEMMEnd2EndBenchmark(state, model,
1202 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
1203 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
1204 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1205 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1206 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1207 4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1208 }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1209 static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1210 GEMMEnd2EndBenchmark(state, model,
1211 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
1212 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
1213 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1214 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1215 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1216 4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1217 }
1218
qu8_gemm_2x4c8__wasmsimd_mul32_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1219 static void qu8_gemm_2x4c8__wasmsimd_mul32_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1220 GEMMEnd2EndBenchmark(state, model,
1221 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64,
1222 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64,
1223 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
1224 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
1225 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1226 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1227 }
qu8_gemm_2x4c8__wasmsimd_mul32_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1228 static void qu8_gemm_2x4c8__wasmsimd_mul32_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1229 GEMMEnd2EndBenchmark(state, model,
1230 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128,
1231 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128,
1232 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
1233 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
1234 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1235 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1236 }
qu8_gemm_3x4c8__wasmsimd_mul32_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1237 static void qu8_gemm_3x4c8__wasmsimd_mul32_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1238 GEMMEnd2EndBenchmark(state, model,
1239 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64,
1240 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64,
1241 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
1242 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
1243 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1244 3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1245 }
qu8_gemm_3x4c8__wasmsimd_mul32_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1246 static void qu8_gemm_3x4c8__wasmsimd_mul32_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1247 GEMMEnd2EndBenchmark(state, model,
1248 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128,
1249 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128,
1250 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
1251 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
1252 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1253 3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1254 }
1255
1256 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128)1257 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
1258 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
1259 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
1260 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
1261 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128)
1262
1263 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
1264 BENCHMARK_QU8_END2END(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
1265 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
1266 BENCHMARK_QU8_END2END(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
1267 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
1268 BENCHMARK_QU8_END2END(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)
1269
1270 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
1271 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
1272 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
1273 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
1274 BENCHMARK_QU8_END2END(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
1275 BENCHMARK_QU8_END2END(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128)
1276
1277 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_mul32_ld64)
1278 BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_mul32_ld128)
1279 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_mul32_ld64)
1280 BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_mul32_ld128)
1281 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1282
1283
1284 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1285 static void qu8_gemm_2x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1286 GEMMEnd2EndBenchmark(state, model,
1287 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
1288 xnn_qu8_igemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
1289 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1290 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1291 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1292 2 /* mr */, 2 /* nr */);
1293 }
qu8_gemm_3x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1294 static void qu8_gemm_3x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1295 GEMMEnd2EndBenchmark(state, model,
1296 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
1297 xnn_qu8_igemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
1298 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1299 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1300 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1301 3 /* mr */, 2 /* nr */);
1302 }
qu8_gemm_4x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1303 static void qu8_gemm_4x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1304 GEMMEnd2EndBenchmark(state, model,
1305 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
1306 xnn_qu8_igemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
1307 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1308 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1309 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1310 4 /* mr */, 2 /* nr */);
1311 }
qu8_gemm_2x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1312 static void qu8_gemm_2x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1313 GEMMEnd2EndBenchmark(state, model,
1314 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
1315 xnn_qu8_igemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
1316 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1317 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1318 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1319 2 /* mr */, 4 /* nr */);
1320 }
qu8_gemm_3x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1321 static void qu8_gemm_3x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1322 GEMMEnd2EndBenchmark(state, model,
1323 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
1324 xnn_qu8_igemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
1325 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1326 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1327 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1328 3 /* mr */, 4 /* nr */);
1329 }
qu8_gemm_4x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1330 static void qu8_gemm_4x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1331 GEMMEnd2EndBenchmark(state, model,
1332 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
1333 xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
1334 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1335 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1336 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1337 4 /* mr */, 4 /* nr */);
1338 }
1339
1340 BENCHMARK_QU8_END2END(qu8_gemm_2x2__wasm_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_3x2__wasm_fmagic)1341 BENCHMARK_QU8_END2END(qu8_gemm_3x2__wasm_fmagic)
1342 BENCHMARK_QU8_END2END(qu8_gemm_4x2__wasm_fmagic)
1343 BENCHMARK_QU8_END2END(qu8_gemm_2x4__wasm_fmagic)
1344 BENCHMARK_QU8_END2END(qu8_gemm_3x4__wasm_fmagic)
1345 BENCHMARK_QU8_END2END(qu8_gemm_4x4__wasm_fmagic)
1346 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1347
1348
1349 static void qu8_gemm_2x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1350 GEMMEnd2EndBenchmark(state, model,
1351 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
1352 xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
1353 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1354 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1355 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1356 2 /* mr */, 2 /* nr */);
1357 }
qu8_gemm_3x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1358 static void qu8_gemm_3x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1359 GEMMEnd2EndBenchmark(state, model,
1360 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
1361 xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
1362 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1363 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1364 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1365 3 /* mr */, 2 /* nr */);
1366 }
qu8_gemm_4x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1367 static void qu8_gemm_4x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1368 GEMMEnd2EndBenchmark(state, model,
1369 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
1370 xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
1371 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1372 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1373 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1374 4 /* mr */, 2 /* nr */);
1375 }
qu8_gemm_2x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1376 static void qu8_gemm_2x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1377 GEMMEnd2EndBenchmark(state, model,
1378 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
1379 xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
1380 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1381 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1382 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1383 2 /* mr */, 4 /* nr */);
1384 }
qu8_gemm_3x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1385 static void qu8_gemm_3x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1386 GEMMEnd2EndBenchmark(state, model,
1387 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
1388 xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
1389 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1390 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1391 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1392 3 /* mr */, 4 /* nr */);
1393 }
qu8_gemm_4x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1394 static void qu8_gemm_4x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1395 GEMMEnd2EndBenchmark(state, model,
1396 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
1397 xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
1398 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1399 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1400 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1401 4 /* mr */, 4 /* nr */);
1402 }
1403
qu8_gemm_2x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1404 static void qu8_gemm_2x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1405 GEMMEnd2EndBenchmark(state, model,
1406 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
1407 xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic,
1408 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1409 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1410 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1411 2 /* mr */, 2 /* nr */);
1412 }
qu8_gemm_3x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1413 static void qu8_gemm_3x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1414 GEMMEnd2EndBenchmark(state, model,
1415 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
1416 xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_imagic,
1417 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1418 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1419 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1420 3 /* mr */, 2 /* nr */);
1421 }
qu8_gemm_4x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1422 static void qu8_gemm_4x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1423 GEMMEnd2EndBenchmark(state, model,
1424 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
1425 xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_imagic,
1426 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1427 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1428 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1429 4 /* mr */, 2 /* nr */);
1430 }
qu8_gemm_2x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1431 static void qu8_gemm_2x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1432 GEMMEnd2EndBenchmark(state, model,
1433 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
1434 xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_imagic,
1435 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1436 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1437 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1438 2 /* mr */, 4 /* nr */);
1439 }
qu8_gemm_3x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1440 static void qu8_gemm_3x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1441 GEMMEnd2EndBenchmark(state, model,
1442 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
1443 xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_imagic,
1444 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1445 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1446 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1447 3 /* mr */, 4 /* nr */);
1448 }
qu8_gemm_4x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1449 static void qu8_gemm_4x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1450 GEMMEnd2EndBenchmark(state, model,
1451 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic,
1452 xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_imagic,
1453 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1454 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1455 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1456 4 /* mr */, 4 /* nr */);
1457 }
1458
qu8_gemm_2x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1459 static void qu8_gemm_2x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1460 GEMMEnd2EndBenchmark(state, model,
1461 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
1462 xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
1463 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1464 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1465 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1466 2 /* mr */, 2 /* nr */);
1467 }
qu8_gemm_3x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1468 static void qu8_gemm_3x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1469 GEMMEnd2EndBenchmark(state, model,
1470 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
1471 xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
1472 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1473 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1474 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1475 3 /* mr */, 2 /* nr */);
1476 }
qu8_gemm_4x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1477 static void qu8_gemm_4x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1478 GEMMEnd2EndBenchmark(state, model,
1479 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
1480 xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
1481 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1482 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1483 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1484 4 /* mr */, 2 /* nr */);
1485 }
qu8_gemm_2x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1486 static void qu8_gemm_2x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1487 GEMMEnd2EndBenchmark(state, model,
1488 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
1489 xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
1490 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1491 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1492 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1493 2 /* mr */, 4 /* nr */);
1494 }
qu8_gemm_3x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1495 static void qu8_gemm_3x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1496 GEMMEnd2EndBenchmark(state, model,
1497 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
1498 xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
1499 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1500 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1501 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1502 3 /* mr */, 4 /* nr */);
1503 }
qu8_gemm_4x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1504 static void qu8_gemm_4x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1505 GEMMEnd2EndBenchmark(state, model,
1506 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
1507 xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
1508 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1509 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1510 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1511 4 /* mr */, 4 /* nr */);
1512 }
1513
1514 BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_fmagic)
1515 BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_fmagic)
1516 BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_fmagic)
1517 BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_fmagic)
1518 BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_fmagic)
1519 BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_fmagic)
1520
1521 BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_imagic)
1522 BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_imagic)
1523 BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_imagic)
1524 BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_imagic)
1525 BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_imagic)
1526 BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_imagic)
1527
1528 BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_lrintf)
1529 BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_lrintf)
1530 BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_lrintf)
1531 BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_lrintf)
1532 BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_lrintf)
1533 BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_lrintf)
1534
1535 #ifndef XNNPACK_BENCHMARK_NO_MAIN
1536 BENCHMARK_MAIN();
1537 #endif
1538