1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <xnnpack.h>
13
14 #include <benchmark/benchmark.h>
15
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 #include <xnnpack/dwconv.h>
20 #include <xnnpack/params.h>
21 #include <xnnpack/params-init.h>
22
23
DWConvEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,xnn_init_qs8_conv_minmax_params_fn init_params,uint8_t channel_tile,uint8_t primary_tile,benchmark::utils::IsaCheckFunction isa_check=nullptr)24 static void DWConvEnd2EndBenchmark(
25 benchmark::State& state,
26 models::ExecutionPlanFactory model_factory,
27 xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,
28 xnn_init_qs8_conv_minmax_params_fn init_params,
29 uint8_t channel_tile, uint8_t primary_tile,
30 benchmark::utils::IsaCheckFunction isa_check = nullptr)
31 {
32 if (isa_check && !isa_check(state)) {
33 return;
34 }
35 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
36 state.SkipWithError("failed to initialize XNNPACK");
37 return;
38 }
39
40 // Override microkernels chosen in xnn_initialize
41 for (size_t i = 0; i < XNN_MAX_QS8_DWCONV_UKERNELS; i++) {
42 // Replace only the microkernel the matching kernel size.
43 if (xnn_params.qs8.dwconv[i].primary_tile == primary_tile) {
44 // Note: do not directly assign to xnn_params.qs8.dwconv[i] because it breaks older gcc.
45 xnn_params.qs8.dwconv[i].minmax.unipass = xnn_dwconv_unipass_ukernel_function(dwconv);
46 xnn_params.qs8.dwconv[i].channel_tile = channel_tile;
47 xnn_params.qs8.dwconv[i].primary_tile = primary_tile;
48 xnn_params.qs8.dwconv[i].incremental_tile = 0;
49 xnn_params.qs8.dwconv[i].init.qs8 = init_params;
50 break;
51 }
52 }
53
54 auto execution_plan = model_factory(nullptr);
55 if (execution_plan.empty()) {
56 state.SkipWithError("failed to create a model");
57 return;
58 }
59
60 for (auto _ : state) {
61 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
62 xnn_status status = xnn_run_operator(op.get(), nullptr);
63 if (status != xnn_status_success) {
64 state.SkipWithError("failed to run a model");
65 return;
66 }
67 }
68 }
69
70 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
71 if (cpu_frequency != 0) {
72 state.counters["cpufreq"] = cpu_frequency;
73 }
74 }
75
76
77 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State & state,models::ExecutionPlanFactory model)78 static void qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
79 DWConvEnd2EndBenchmark(state, model,
80 xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul8_ld64,
81 xnn_init_qs8_conv_minmax_rndnu_neon_params,
82 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
83 }
qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State & state,models::ExecutionPlanFactory model)84 static void qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
85 DWConvEnd2EndBenchmark(state, model,
86 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld64,
87 xnn_init_qs8_conv_minmax_rndnu_neon_params,
88 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
89 }
qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State & state,models::ExecutionPlanFactory model)90 static void qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
91 DWConvEnd2EndBenchmark(state, model,
92 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld128,
93 xnn_init_qs8_conv_minmax_rndnu_neon_params,
94 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
95 }
qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State & state,models::ExecutionPlanFactory model)96 static void qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
97 DWConvEnd2EndBenchmark(state, model,
98 xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mla8_ld64,
99 xnn_init_qs8_conv_minmax_rndnu_neon_params,
100 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
101 }
qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State & state,models::ExecutionPlanFactory model)102 static void qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
103 DWConvEnd2EndBenchmark(state, model,
104 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64,
105 xnn_init_qs8_conv_minmax_rndnu_neon_params,
106 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
107 }
qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State & state,models::ExecutionPlanFactory model)108 static void qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
109 DWConvEnd2EndBenchmark(state, model,
110 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld128,
111 xnn_init_qs8_conv_minmax_rndnu_neon_params,
112 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
113 }
qs8_dwconv_up8x9__neon_mul16(benchmark::State & state,models::ExecutionPlanFactory model)114 static void qs8_dwconv_up8x9__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
115 DWConvEnd2EndBenchmark(state, model,
116 xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16,
117 xnn_init_qs8_conv_minmax_rndnu_neon_params,
118 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
119 }
qs8_dwconv_up16x9__neon_mul16(benchmark::State & state,models::ExecutionPlanFactory model)120 static void qs8_dwconv_up16x9__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
121 DWConvEnd2EndBenchmark(state, model,
122 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul16,
123 xnn_init_qs8_conv_minmax_rndnu_neon_params,
124 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
125 }
qs8_dwconv_up24x9__neon_mul16(benchmark::State & state,models::ExecutionPlanFactory model)126 static void qs8_dwconv_up24x9__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
127 DWConvEnd2EndBenchmark(state, model,
128 xnn_qs8_dwconv_minmax_rndnu_ukernel_up24x9__neon_mul16,
129 xnn_init_qs8_conv_minmax_rndnu_neon_params,
130 24 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
131 }
qs8_dwconv_up32x9__neon_mul16(benchmark::State & state,models::ExecutionPlanFactory model)132 static void qs8_dwconv_up32x9__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
133 DWConvEnd2EndBenchmark(state, model,
134 xnn_qs8_dwconv_minmax_rndnu_ukernel_up32x9__neon_mul16,
135 xnn_init_qs8_conv_minmax_rndnu_neon_params,
136 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
137 }
138
// Register every NEON wrapper defined above with the QS8 end-to-end harness.
// NOTE(review): BENCHMARK_QS8_END2END comes from bench/end2end.h; presumably it
// registers the function once per QS8 model - confirm there.
139 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__neon_mul8_ld64);
140 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__neon_mul8_ld64);
141 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__neon_mul8_ld128);
142 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__neon_mla8_ld64);
143 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__neon_mla8_ld64);
144 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__neon_mla8_ld128);
145 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__neon_mul16);
146 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__neon_mul16);
147 BENCHMARK_QS8_END2END(qs8_dwconv_up24x9__neon_mul16);
148 BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__neon_mul16);
149 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
150
151
152 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State & state,models::ExecutionPlanFactory model)153 static void qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
154 DWConvEnd2EndBenchmark(state, model,
155 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx512skx_mul32,
156 xnn_init_qs8_conv_minmax_fp32_avx512_params,
157 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX);
158 }
qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State & state,models::ExecutionPlanFactory model)159 static void qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
160 DWConvEnd2EndBenchmark(state, model,
161 xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32,
162 xnn_init_qs8_conv_minmax_fp32_avx512_params,
163 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX);
164 }
qs8_dwconv_up16x9__avx2_mul16_vpmovsx(benchmark::State & state,models::ExecutionPlanFactory model)165 static void qs8_dwconv_up16x9__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) {
166 DWConvEnd2EndBenchmark(state, model,
167 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpmovsx,
168 xnn_init_qs8_conv_minmax_fp32_avx2_params,
169 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
170 }
qs8_dwconv_up32x9__avx2_mul16_vpmovsx(benchmark::State & state,models::ExecutionPlanFactory model)171 static void qs8_dwconv_up32x9__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) {
172 DWConvEnd2EndBenchmark(state, model,
173 xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpmovsx,
174 xnn_init_qs8_conv_minmax_fp32_avx2_params,
175 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
176 }
qs8_dwconv_up16x9__avx2_mul16_vpunpck(benchmark::State & state,models::ExecutionPlanFactory model)177 static void qs8_dwconv_up16x9__avx2_mul16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) {
178 DWConvEnd2EndBenchmark(state, model,
179 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpunpck,
180 xnn_init_qs8_conv_minmax_fp32_avx2_params,
181 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
182 }
qs8_dwconv_up32x9__avx2_mul16_vpunpck(benchmark::State & state,models::ExecutionPlanFactory model)183 static void qs8_dwconv_up32x9__avx2_mul16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) {
184 DWConvEnd2EndBenchmark(state, model,
185 xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpunpck,
186 xnn_init_qs8_conv_minmax_fp32_avx2_params,
187 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
188 }
qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck(benchmark::State & state,models::ExecutionPlanFactory model)189 static void qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) {
190 DWConvEnd2EndBenchmark(state, model,
191 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_add16_vpunpck,
192 xnn_init_qs8_conv_minmax_fp32_avx2_params,
193 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
194 }
qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck(benchmark::State & state,models::ExecutionPlanFactory model)195 static void qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) {
196 DWConvEnd2EndBenchmark(state, model,
197 xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_add16_vpunpck,
198 xnn_init_qs8_conv_minmax_fp32_avx2_params,
199 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
200 }
qs8_dwconv_up8x9__avx2_mul32(benchmark::State & state,models::ExecutionPlanFactory model)201 static void qs8_dwconv_up8x9__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
202 DWConvEnd2EndBenchmark(state, model,
203 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx2_mul32,
204 xnn_init_qs8_conv_minmax_fp32_avx2_params,
205 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
206 }
qs8_dwconv_up16x9__avx2_mul32(benchmark::State & state,models::ExecutionPlanFactory model)207 static void qs8_dwconv_up16x9__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
208 DWConvEnd2EndBenchmark(state, model,
209 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32,
210 xnn_init_qs8_conv_minmax_fp32_avx2_params,
211 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
212 }
qs8_dwconv_up32x9__avx2_mul32(benchmark::State & state,models::ExecutionPlanFactory model)213 static void qs8_dwconv_up32x9__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
214 DWConvEnd2EndBenchmark(state, model,
215 xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul32,
216 xnn_init_qs8_conv_minmax_fp32_avx2_params,
217 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
218 }
qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)219 static void qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
220 DWConvEnd2EndBenchmark(state, model,
221 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul16_add16,
222 xnn_init_qs8_conv_minmax_fp32_sse4_params,
223 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
224 }
qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)225 static void qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
226 DWConvEnd2EndBenchmark(state, model,
227 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16,
228 xnn_init_qs8_conv_minmax_fp32_sse4_params,
229 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
230 }
qs8_dwconv_up8x9__xop_mul32(benchmark::State & state,models::ExecutionPlanFactory model)231 static void qs8_dwconv_up8x9__xop_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
232 DWConvEnd2EndBenchmark(state, model,
233 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32,
234 xnn_init_qs8_conv_minmax_fp32_sse4_params,
235 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
236 }
qs8_dwconv_up16x9__xop_mul32(benchmark::State & state,models::ExecutionPlanFactory model)237 static void qs8_dwconv_up16x9__xop_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
238 DWConvEnd2EndBenchmark(state, model,
239 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32,
240 xnn_init_qs8_conv_minmax_fp32_sse4_params,
241 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
242 }
qs8_dwconv_up8x9__avx_mul16(benchmark::State & state,models::ExecutionPlanFactory model)243 static void qs8_dwconv_up8x9__avx_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
244 DWConvEnd2EndBenchmark(state, model,
245 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16,
246 xnn_init_qs8_conv_minmax_fp32_sse4_params,
247 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
248 }
qs8_dwconv_up16x9__avx_mul16(benchmark::State & state,models::ExecutionPlanFactory model)249 static void qs8_dwconv_up16x9__avx_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
250 DWConvEnd2EndBenchmark(state, model,
251 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16,
252 xnn_init_qs8_conv_minmax_fp32_sse4_params,
253 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
254 }
qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)255 static void qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
256 DWConvEnd2EndBenchmark(state, model,
257 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16_add16,
258 xnn_init_qs8_conv_minmax_fp32_sse4_params,
259 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
260 }
qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)261 static void qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
262 DWConvEnd2EndBenchmark(state, model,
263 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16,
264 xnn_init_qs8_conv_minmax_fp32_sse4_params,
265 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
266 }
qs8_dwconv_up8x9__avx_mul32(benchmark::State & state,models::ExecutionPlanFactory model)267 static void qs8_dwconv_up8x9__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
268 DWConvEnd2EndBenchmark(state, model,
269 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
270 xnn_init_qs8_conv_minmax_fp32_sse4_params,
271 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
272 }
qs8_dwconv_up16x9__avx_mul32(benchmark::State & state,models::ExecutionPlanFactory model)273 static void qs8_dwconv_up16x9__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
274 DWConvEnd2EndBenchmark(state, model,
275 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
276 xnn_init_qs8_conv_minmax_fp32_sse4_params,
277 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
278 }
qs8_dwconv_up8x9__sse41_mul16(benchmark::State & state,models::ExecutionPlanFactory model)279 static void qs8_dwconv_up8x9__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
280 DWConvEnd2EndBenchmark(state, model,
281 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16,
282 xnn_init_qs8_conv_minmax_fp32_sse4_params,
283 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
284 }
qs8_dwconv_up16x9__sse41_mul16(benchmark::State & state,models::ExecutionPlanFactory model)285 static void qs8_dwconv_up16x9__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
286 DWConvEnd2EndBenchmark(state, model,
287 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16,
288 xnn_init_qs8_conv_minmax_fp32_sse4_params,
289 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
290 }
qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)291 static void qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
292 DWConvEnd2EndBenchmark(state, model,
293 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16,
294 xnn_init_qs8_conv_minmax_fp32_sse4_params,
295 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
296 }
qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)297 static void qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
298 DWConvEnd2EndBenchmark(state, model,
299 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16_add16,
300 xnn_init_qs8_conv_minmax_fp32_sse4_params,
301 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
302 }
qs8_dwconv_up8x9__sse41_mul32(benchmark::State & state,models::ExecutionPlanFactory model)303 static void qs8_dwconv_up8x9__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
304 DWConvEnd2EndBenchmark(state, model,
305 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32,
306 xnn_init_qs8_conv_minmax_fp32_sse4_params,
307 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
308 }
qs8_dwconv_up16x9__sse41_mul32(benchmark::State & state,models::ExecutionPlanFactory model)309 static void qs8_dwconv_up16x9__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
310 DWConvEnd2EndBenchmark(state, model,
311 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32,
312 xnn_init_qs8_conv_minmax_fp32_sse4_params,
313 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
314 }
qs8_dwconv_up8x9__sse2_mul16(benchmark::State & state,models::ExecutionPlanFactory model)315 static void qs8_dwconv_up8x9__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
316 DWConvEnd2EndBenchmark(state, model,
317 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16,
318 xnn_init_qs8_conv_minmax_fp32_sse2_params,
319 8 /* channel tile */, 9 /* primary tile */);
320 }
qs8_dwconv_up16x9__sse2_mul16(benchmark::State & state,models::ExecutionPlanFactory model)321 static void qs8_dwconv_up16x9__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
322 DWConvEnd2EndBenchmark(state, model,
323 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16,
324 xnn_init_qs8_conv_minmax_fp32_sse2_params,
325 16 /* channel tile */, 9 /* primary tile */);
326 }
qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)327 static void qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
328 DWConvEnd2EndBenchmark(state, model,
329 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16,
330 xnn_init_qs8_conv_minmax_fp32_sse2_params,
331 8 /* channel tile */, 9 /* primary tile */);
332 }
qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)333 static void qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
334 DWConvEnd2EndBenchmark(state, model,
335 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16_add16,
336 xnn_init_qs8_conv_minmax_fp32_sse2_params,
337 16 /* channel tile */, 9 /* primary tile */);
338 }
339
// Register every x86 wrapper defined above with the QS8 end-to-end harness,
// grouped by instruction set.
// NOTE(review): BENCHMARK_QS8_END2END comes from bench/end2end.h; presumably it
// registers the function once per QS8 model - confirm there.
// AVX512SKX
340 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx512skx_mul32);
341 BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__avx512skx_mul32);
342
// AVX2
343 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx2_mul16_vpmovsx);
344 BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__avx2_mul16_vpmovsx);
345 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx2_mul16_vpunpck);
346 BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__avx2_mul16_vpunpck);
347 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck);
348 BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck);
349 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__avx2_mul32);
350 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx2_mul32);
351 BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__avx2_mul32);
352
// XOP
353 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__xop_mul16_add16);
354 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__xop_mul16_add16);
355 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__xop_mul32);
356 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__xop_mul32);
357
// AVX
358 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__avx_mul16);
359 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx_mul16);
360 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__avx_mul16_add16);
361 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx_mul16_add16);
362 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__avx_mul32);
363 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx_mul32);
364
// SSE4.1
365 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__sse41_mul16);
366 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__sse41_mul16);
367 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__sse41_mul16_add16);
368 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__sse41_mul16_add16);
369 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__sse41_mul32);
370 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__sse41_mul32);
371
// SSE2
372 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__sse2_mul16);
373 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__sse2_mul16);
374 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__sse2_mul16_add16);
375 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__sse2_mul16_add16);
376 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
377
378
379 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State & state,models::ExecutionPlanFactory model)380 static void qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
381 DWConvEnd2EndBenchmark(state, model,
382 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16,
383 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
384 8 /* channel tile */, 9 /* primary tile */);
385 }
qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State & state,models::ExecutionPlanFactory model)386 static void qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
387 DWConvEnd2EndBenchmark(state, model,
388 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16,
389 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
390 16 /* channel tile */, 9 /* primary tile */);
391 }
392
qs8_dwconv_up8x9__wasmsimd_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)393 static void qs8_dwconv_up8x9__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
394 DWConvEnd2EndBenchmark(state, model,
395 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16_add16,
396 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
397 8 /* channel tile */, 9 /* primary tile */);
398 }
qs8_dwconv_up16x9__wasmsimd_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)399 static void qs8_dwconv_up16x9__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
400 DWConvEnd2EndBenchmark(state, model,
401 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16,
402 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
403 16 /* channel tile */, 9 /* primary tile */);
404 }
405
// Register the WAsm SIMD wrappers defined above with the QS8 end-to-end harness.
// NOTE(review): BENCHMARK_QS8_END2END comes from bench/end2end.h; presumably it
// registers the function once per QS8 model - confirm there.
406 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__wasmsimd_mul16);
407 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__wasmsimd_mul16);
408
409 BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__wasmsimd_mul16_add16);
410 BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__wasmsimd_mul16_add16);
411 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
412
413
414 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_dwconv_up1x9__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)415 static void qs8_dwconv_up1x9__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
416 DWConvEnd2EndBenchmark(state, model,
417 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__wasm_fmagic,
418 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
419 1 /* channel tile */, 9 /* primary tile */);
420 }
qs8_dwconv_up2x9__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)421 static void qs8_dwconv_up2x9__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
422 DWConvEnd2EndBenchmark(state, model,
423 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic,
424 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
425 2 /* channel tile */, 9 /* primary tile */);
426 }
qs8_dwconv_up4x9__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)427 static void qs8_dwconv_up4x9__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
428 DWConvEnd2EndBenchmark(state, model,
429 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__wasm_fmagic,
430 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
431 4 /* channel tile */, 9 /* primary tile */);
432 }
433
// Register the WAsm fmagic wrappers defined above with the QS8 end-to-end harness.
// NOTE(review): BENCHMARK_QS8_END2END comes from bench/end2end.h; presumably it
// registers the function once per QS8 model - confirm there.
434 BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__wasm_fmagic);
435 BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__wasm_fmagic);
436 BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__wasm_fmagic);
437 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
438
439
qs8_dwconv_up1x9__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)440 static void qs8_dwconv_up1x9__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
441 DWConvEnd2EndBenchmark(state, model,
442 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic,
443 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
444 1 /* channel tile */, 9 /* primary tile */);
445 }
qs8_dwconv_up2x9__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)446 static void qs8_dwconv_up2x9__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
447 DWConvEnd2EndBenchmark(state, model,
448 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_fmagic,
449 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
450 2 /* channel tile */, 9 /* primary tile */);
451 }
qs8_dwconv_up4x9__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)452 static void qs8_dwconv_up4x9__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
453 DWConvEnd2EndBenchmark(state, model,
454 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_fmagic,
455 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
456 4 /* channel tile */, 9 /* primary tile */);
457 }
458
qs8_dwconv_up1x9__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)459 static void qs8_dwconv_up1x9__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
460 DWConvEnd2EndBenchmark(state, model,
461 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_imagic,
462 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
463 1 /* channel tile */, 9 /* primary tile */);
464 }
qs8_dwconv_up2x9__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)465 static void qs8_dwconv_up2x9__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
466 DWConvEnd2EndBenchmark(state, model,
467 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic,
468 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
469 2 /* channel tile */, 9 /* primary tile */);
470 }
qs8_dwconv_up4x9__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)471 static void qs8_dwconv_up4x9__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
472 DWConvEnd2EndBenchmark(state, model,
473 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_imagic,
474 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
475 4 /* channel tile */, 9 /* primary tile */);
476 }
477
qs8_dwconv_up1x9__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)478 static void qs8_dwconv_up1x9__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
479 DWConvEnd2EndBenchmark(state, model,
480 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_lrintf,
481 xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
482 1 /* channel tile */, 9 /* primary tile */);
483 }
qs8_dwconv_up2x9__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)484 static void qs8_dwconv_up2x9__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
485 DWConvEnd2EndBenchmark(state, model,
486 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf,
487 xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
488 2 /* channel tile */, 9 /* primary tile */);
489 }
qs8_dwconv_up4x9__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)490 static void qs8_dwconv_up4x9__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
491 DWConvEnd2EndBenchmark(state, model,
492 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_lrintf,
493 xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
494 4 /* channel tile */, 9 /* primary tile */);
495 }
496
// Register the scalar wrappers defined above with the QS8 end-to-end harness.
// These registrations sit outside any architecture #if, so they are compiled
// for every target.
// NOTE(review): BENCHMARK_QS8_END2END comes from bench/end2end.h; presumably it
// registers the function once per QS8 model - confirm there.
// fmagic variants
497 BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__scalar_fmagic);
498 BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__scalar_fmagic);
499 BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__scalar_fmagic);
500
// imagic variants
501 BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__scalar_imagic);
502 BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__scalar_imagic);
503 BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__scalar_imagic);
504
// lrintf variants
505 BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__scalar_lrintf);
506 BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__scalar_lrintf);
507 BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__scalar_lrintf);
508
509
// Emit Google Benchmark's main() via BENCHMARK_MAIN() unless the build defines
// XNNPACK_BENCHMARK_NO_MAIN.
510 #ifndef XNNPACK_BENCHMARK_NO_MAIN
511 BENCHMARK_MAIN();
512 #endif
513