• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <xnnpack.h>
13 
14 #include <benchmark/benchmark.h>
15 
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 #include <xnnpack/dwconv.h>
20 #include <xnnpack/params.h>
21 #include <xnnpack/params-init.h>
22 
23 
DWConvEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,xnn_init_qs8_conv_minmax_params_fn init_params,uint8_t channel_tile,uint8_t primary_tile,benchmark::utils::IsaCheckFunction isa_check=nullptr)24 static void DWConvEnd2EndBenchmark(
25   benchmark::State& state,
26   models::ExecutionPlanFactory model_factory,
27   xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,
28   xnn_init_qs8_conv_minmax_params_fn init_params,
29   uint8_t channel_tile, uint8_t primary_tile,
30   benchmark::utils::IsaCheckFunction isa_check = nullptr)
31 {
32   if (isa_check && !isa_check(state)) {
33     return;
34   }
35   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
36     state.SkipWithError("failed to initialize XNNPACK");
37     return;
38   }
39 
40   // Override microkernels chosen in xnn_initialize
41   for (size_t i = 0; i < XNN_MAX_QS8_DWCONV_UKERNELS; i++) {
42     // Replace only the microkernel the matching kernel size.
43     if (xnn_params.qs8.dwconv[i].primary_tile == primary_tile) {
44       // Note: do not directly assign to xnn_params.qs8.dwconv[i] because it breaks older gcc.
45       xnn_params.qs8.dwconv[i].minmax.unipass = xnn_dwconv_unipass_ukernel_function(dwconv);
46       xnn_params.qs8.dwconv[i].channel_tile = channel_tile;
47       xnn_params.qs8.dwconv[i].primary_tile = primary_tile;
48       xnn_params.qs8.dwconv[i].incremental_tile = 0;
49       xnn_params.qs8.dwconv[i].init.qs8 = init_params;
50       break;
51     }
52   }
53 
54   auto execution_plan = model_factory(nullptr);
55   if (execution_plan.empty()) {
56     state.SkipWithError("failed to create a model");
57     return;
58   }
59 
60   for (auto _ : state) {
61     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
62       xnn_status status = xnn_run_operator(op.get(), nullptr);
63       if (status != xnn_status_success) {
64         state.SkipWithError("failed to run a model");
65         return;
66       }
67     }
68   }
69 
70   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
71   if (cpu_frequency != 0) {
72     state.counters["cpufreq"] = cpu_frequency;
73   }
74 }
75 
76 
77 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State & state,models::ExecutionPlanFactory model)78   static void qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
79     DWConvEnd2EndBenchmark(state, model,
80       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul8_ld64,
81       xnn_init_qs8_conv_minmax_rndnu_neon_params,
82       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
83   }
qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State & state,models::ExecutionPlanFactory model)84   static void qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
85     DWConvEnd2EndBenchmark(state, model,
86       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld64,
87       xnn_init_qs8_conv_minmax_rndnu_neon_params,
88       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
89   }
qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State & state,models::ExecutionPlanFactory model)90   static void qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
91     DWConvEnd2EndBenchmark(state, model,
92       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld128,
93       xnn_init_qs8_conv_minmax_rndnu_neon_params,
94       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
95   }
qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State & state,models::ExecutionPlanFactory model)96   static void qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
97     DWConvEnd2EndBenchmark(state, model,
98       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mla8_ld64,
99       xnn_init_qs8_conv_minmax_rndnu_neon_params,
100       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
101   }
qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State & state,models::ExecutionPlanFactory model)102   static void qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
103     DWConvEnd2EndBenchmark(state, model,
104       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64,
105       xnn_init_qs8_conv_minmax_rndnu_neon_params,
106       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
107   }
qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State & state,models::ExecutionPlanFactory model)108   static void qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
109     DWConvEnd2EndBenchmark(state, model,
110       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld128,
111       xnn_init_qs8_conv_minmax_rndnu_neon_params,
112       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
113   }
qs8_dwconv_up8x9__neon_mul16(benchmark::State & state,models::ExecutionPlanFactory model)114   static void qs8_dwconv_up8x9__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
115     DWConvEnd2EndBenchmark(state, model,
116       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16,
117       xnn_init_qs8_conv_minmax_rndnu_neon_params,
118       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
119   }
qs8_dwconv_up16x9__neon_mul16(benchmark::State & state,models::ExecutionPlanFactory model)120   static void qs8_dwconv_up16x9__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
121     DWConvEnd2EndBenchmark(state, model,
122       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul16,
123       xnn_init_qs8_conv_minmax_rndnu_neon_params,
124       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
125   }
qs8_dwconv_up24x9__neon_mul16(benchmark::State & state,models::ExecutionPlanFactory model)126   static void qs8_dwconv_up24x9__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
127     DWConvEnd2EndBenchmark(state, model,
128       xnn_qs8_dwconv_minmax_rndnu_ukernel_up24x9__neon_mul16,
129       xnn_init_qs8_conv_minmax_rndnu_neon_params,
130       24 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
131   }
qs8_dwconv_up32x9__neon_mul16(benchmark::State & state,models::ExecutionPlanFactory model)132   static void qs8_dwconv_up32x9__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
133     DWConvEnd2EndBenchmark(state, model,
134       xnn_qs8_dwconv_minmax_rndnu_ukernel_up32x9__neon_mul16,
135       xnn_init_qs8_conv_minmax_rndnu_neon_params,
136       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
137   }
138 
// Passes each NEON wrapper defined in this #if section to BENCHMARK_QS8_END2END.
// NOTE(review): BENCHMARK_QS8_END2END is not defined in this file (presumably
// bench/end2end.h) - confirm which models it instantiates each function for.
139   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__neon_mul8_ld64);
140   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__neon_mul8_ld64);
141   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__neon_mul8_ld128);
142   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__neon_mla8_ld64);
143   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__neon_mla8_ld64);
144   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__neon_mla8_ld128);
145   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__neon_mul16);
146   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__neon_mul16);
147   BENCHMARK_QS8_END2END(qs8_dwconv_up24x9__neon_mul16);
148   BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__neon_mul16);
149 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
150 
151 
152 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State & state,models::ExecutionPlanFactory model)153   static void qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
154     DWConvEnd2EndBenchmark(state, model,
155       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx512skx_mul32,
156       xnn_init_qs8_conv_minmax_fp32_avx512_params,
157       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX);
158   }
qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State & state,models::ExecutionPlanFactory model)159   static void qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
160     DWConvEnd2EndBenchmark(state, model,
161       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32,
162       xnn_init_qs8_conv_minmax_fp32_avx512_params,
163       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX);
164   }
qs8_dwconv_up16x9__avx2_mul16_vpmovsx(benchmark::State & state,models::ExecutionPlanFactory model)165   static void qs8_dwconv_up16x9__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) {
166     DWConvEnd2EndBenchmark(state, model,
167       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpmovsx,
168       xnn_init_qs8_conv_minmax_fp32_avx2_params,
169       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
170   }
qs8_dwconv_up32x9__avx2_mul16_vpmovsx(benchmark::State & state,models::ExecutionPlanFactory model)171   static void qs8_dwconv_up32x9__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) {
172     DWConvEnd2EndBenchmark(state, model,
173       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpmovsx,
174       xnn_init_qs8_conv_minmax_fp32_avx2_params,
175       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
176   }
qs8_dwconv_up16x9__avx2_mul16_vpunpck(benchmark::State & state,models::ExecutionPlanFactory model)177   static void qs8_dwconv_up16x9__avx2_mul16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) {
178     DWConvEnd2EndBenchmark(state, model,
179       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpunpck,
180       xnn_init_qs8_conv_minmax_fp32_avx2_params,
181       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
182   }
qs8_dwconv_up32x9__avx2_mul16_vpunpck(benchmark::State & state,models::ExecutionPlanFactory model)183   static void qs8_dwconv_up32x9__avx2_mul16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) {
184     DWConvEnd2EndBenchmark(state, model,
185       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpunpck,
186       xnn_init_qs8_conv_minmax_fp32_avx2_params,
187       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
188   }
qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck(benchmark::State & state,models::ExecutionPlanFactory model)189   static void qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) {
190     DWConvEnd2EndBenchmark(state, model,
191       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_add16_vpunpck,
192       xnn_init_qs8_conv_minmax_fp32_avx2_params,
193       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
194   }
qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck(benchmark::State & state,models::ExecutionPlanFactory model)195   static void qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) {
196     DWConvEnd2EndBenchmark(state, model,
197       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_add16_vpunpck,
198       xnn_init_qs8_conv_minmax_fp32_avx2_params,
199       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
200   }
qs8_dwconv_up8x9__avx2_mul32(benchmark::State & state,models::ExecutionPlanFactory model)201   static void qs8_dwconv_up8x9__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
202     DWConvEnd2EndBenchmark(state, model,
203       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx2_mul32,
204       xnn_init_qs8_conv_minmax_fp32_avx2_params,
205       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
206   }
qs8_dwconv_up16x9__avx2_mul32(benchmark::State & state,models::ExecutionPlanFactory model)207   static void qs8_dwconv_up16x9__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
208     DWConvEnd2EndBenchmark(state, model,
209       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32,
210       xnn_init_qs8_conv_minmax_fp32_avx2_params,
211       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
212   }
qs8_dwconv_up32x9__avx2_mul32(benchmark::State & state,models::ExecutionPlanFactory model)213   static void qs8_dwconv_up32x9__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
214     DWConvEnd2EndBenchmark(state, model,
215       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul32,
216       xnn_init_qs8_conv_minmax_fp32_avx2_params,
217       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
218   }
qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)219   static void qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
220     DWConvEnd2EndBenchmark(state, model,
221       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul16_add16,
222       xnn_init_qs8_conv_minmax_fp32_sse4_params,
223       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
224   }
qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)225   static void qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
226     DWConvEnd2EndBenchmark(state, model,
227       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16,
228       xnn_init_qs8_conv_minmax_fp32_sse4_params,
229       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
230   }
qs8_dwconv_up8x9__xop_mul32(benchmark::State & state,models::ExecutionPlanFactory model)231   static void qs8_dwconv_up8x9__xop_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
232     DWConvEnd2EndBenchmark(state, model,
233       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32,
234       xnn_init_qs8_conv_minmax_fp32_sse4_params,
235       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
236   }
qs8_dwconv_up16x9__xop_mul32(benchmark::State & state,models::ExecutionPlanFactory model)237   static void qs8_dwconv_up16x9__xop_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
238     DWConvEnd2EndBenchmark(state, model,
239       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32,
240       xnn_init_qs8_conv_minmax_fp32_sse4_params,
241       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
242   }
qs8_dwconv_up8x9__avx_mul16(benchmark::State & state,models::ExecutionPlanFactory model)243   static void qs8_dwconv_up8x9__avx_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
244     DWConvEnd2EndBenchmark(state, model,
245       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16,
246       xnn_init_qs8_conv_minmax_fp32_sse4_params,
247       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
248   }
qs8_dwconv_up16x9__avx_mul16(benchmark::State & state,models::ExecutionPlanFactory model)249   static void qs8_dwconv_up16x9__avx_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
250     DWConvEnd2EndBenchmark(state, model,
251       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16,
252       xnn_init_qs8_conv_minmax_fp32_sse4_params,
253       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
254   }
qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)255   static void qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
256     DWConvEnd2EndBenchmark(state, model,
257       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16_add16,
258       xnn_init_qs8_conv_minmax_fp32_sse4_params,
259       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
260   }
qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)261   static void qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
262     DWConvEnd2EndBenchmark(state, model,
263       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16,
264       xnn_init_qs8_conv_minmax_fp32_sse4_params,
265       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
266   }
qs8_dwconv_up8x9__avx_mul32(benchmark::State & state,models::ExecutionPlanFactory model)267   static void qs8_dwconv_up8x9__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
268     DWConvEnd2EndBenchmark(state, model,
269       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
270       xnn_init_qs8_conv_minmax_fp32_sse4_params,
271       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
272   }
qs8_dwconv_up16x9__avx_mul32(benchmark::State & state,models::ExecutionPlanFactory model)273   static void qs8_dwconv_up16x9__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
274     DWConvEnd2EndBenchmark(state, model,
275       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
276       xnn_init_qs8_conv_minmax_fp32_sse4_params,
277       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
278   }
qs8_dwconv_up8x9__sse41_mul16(benchmark::State & state,models::ExecutionPlanFactory model)279   static void qs8_dwconv_up8x9__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
280     DWConvEnd2EndBenchmark(state, model,
281       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16,
282       xnn_init_qs8_conv_minmax_fp32_sse4_params,
283       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
284   }
qs8_dwconv_up16x9__sse41_mul16(benchmark::State & state,models::ExecutionPlanFactory model)285   static void qs8_dwconv_up16x9__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
286     DWConvEnd2EndBenchmark(state, model,
287       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16,
288       xnn_init_qs8_conv_minmax_fp32_sse4_params,
289       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
290   }
qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)291   static void qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
292     DWConvEnd2EndBenchmark(state, model,
293       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16,
294       xnn_init_qs8_conv_minmax_fp32_sse4_params,
295       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
296   }
qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)297   static void qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
298     DWConvEnd2EndBenchmark(state, model,
299       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16_add16,
300       xnn_init_qs8_conv_minmax_fp32_sse4_params,
301       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
302   }
qs8_dwconv_up8x9__sse41_mul32(benchmark::State & state,models::ExecutionPlanFactory model)303   static void qs8_dwconv_up8x9__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
304     DWConvEnd2EndBenchmark(state, model,
305       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32,
306       xnn_init_qs8_conv_minmax_fp32_sse4_params,
307       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
308   }
qs8_dwconv_up16x9__sse41_mul32(benchmark::State & state,models::ExecutionPlanFactory model)309   static void qs8_dwconv_up16x9__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) {
310     DWConvEnd2EndBenchmark(state, model,
311       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32,
312       xnn_init_qs8_conv_minmax_fp32_sse4_params,
313       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
314   }
qs8_dwconv_up8x9__sse2_mul16(benchmark::State & state,models::ExecutionPlanFactory model)315   static void qs8_dwconv_up8x9__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
316     DWConvEnd2EndBenchmark(state, model,
317       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16,
318       xnn_init_qs8_conv_minmax_fp32_sse2_params,
319       8 /* channel tile */, 9 /* primary tile */);
320   }
qs8_dwconv_up16x9__sse2_mul16(benchmark::State & state,models::ExecutionPlanFactory model)321   static void qs8_dwconv_up16x9__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
322     DWConvEnd2EndBenchmark(state, model,
323       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16,
324       xnn_init_qs8_conv_minmax_fp32_sse2_params,
325       16 /* channel tile */, 9 /* primary tile */);
326   }
qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)327   static void qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
328     DWConvEnd2EndBenchmark(state, model,
329       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16,
330       xnn_init_qs8_conv_minmax_fp32_sse2_params,
331       8 /* channel tile */, 9 /* primary tile */);
332   }
qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)333   static void qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
334     DWConvEnd2EndBenchmark(state, model,
335       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16_add16,
336       xnn_init_qs8_conv_minmax_fp32_sse2_params,
337       16 /* channel tile */, 9 /* primary tile */);
338   }
339 
// Passes each x86 wrapper defined in this #if section to BENCHMARK_QS8_END2END,
// grouped by instruction set: AVX512SKX, AVX2, XOP, AVX, SSE4.1, SSE2.
// NOTE(review): BENCHMARK_QS8_END2END is not defined in this file (presumably
// bench/end2end.h) - confirm which models it instantiates each function for.
340   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx512skx_mul32);
341   BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__avx512skx_mul32);
342 
343   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx2_mul16_vpmovsx);
344   BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__avx2_mul16_vpmovsx);
345   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx2_mul16_vpunpck);
346   BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__avx2_mul16_vpunpck);
347   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck);
348   BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck);
349   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__avx2_mul32);
350   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx2_mul32);
351   BENCHMARK_QS8_END2END(qs8_dwconv_up32x9__avx2_mul32);
352 
353   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__xop_mul16_add16);
354   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__xop_mul16_add16);
355   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__xop_mul32);
356   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__xop_mul32);
357 
358   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__avx_mul16);
359   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx_mul16);
360   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__avx_mul16_add16);
361   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx_mul16_add16);
362   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__avx_mul32);
363   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__avx_mul32);
364 
365   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__sse41_mul16);
366   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__sse41_mul16);
367   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__sse41_mul16_add16);
368   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__sse41_mul16_add16);
369   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__sse41_mul32);
370   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__sse41_mul32);
371 
372   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__sse2_mul16);
373   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__sse2_mul16);
374   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__sse2_mul16_add16);
375   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__sse2_mul16_add16);
376 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
377 
378 
379 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State & state,models::ExecutionPlanFactory model)380   static void qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
381     DWConvEnd2EndBenchmark(state, model,
382       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16,
383       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
384       8 /* channel tile */, 9 /* primary tile */);
385   }
qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State & state,models::ExecutionPlanFactory model)386   static void qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) {
387     DWConvEnd2EndBenchmark(state, model,
388       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16,
389       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
390       16 /* channel tile */, 9 /* primary tile */);
391   }
392 
qs8_dwconv_up8x9__wasmsimd_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)393   static void qs8_dwconv_up8x9__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
394     DWConvEnd2EndBenchmark(state, model,
395       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16_add16,
396       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
397       8 /* channel tile */, 9 /* primary tile */);
398   }
qs8_dwconv_up16x9__wasmsimd_mul16_add16(benchmark::State & state,models::ExecutionPlanFactory model)399   static void qs8_dwconv_up16x9__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) {
400     DWConvEnd2EndBenchmark(state, model,
401       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16,
402       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
403       16 /* channel tile */, 9 /* primary tile */);
404   }
405 
// Passes each WAsm SIMD wrapper defined in this #if section to BENCHMARK_QS8_END2END.
// NOTE(review): BENCHMARK_QS8_END2END is not defined in this file (presumably
// bench/end2end.h) - confirm which models it instantiates each function for.
406   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__wasmsimd_mul16);
407   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__wasmsimd_mul16);
408 
409   BENCHMARK_QS8_END2END(qs8_dwconv_up8x9__wasmsimd_mul16_add16);
410   BENCHMARK_QS8_END2END(qs8_dwconv_up16x9__wasmsimd_mul16_add16);
411 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
412 
413 
414 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_dwconv_up1x9__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)415   static void qs8_dwconv_up1x9__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
416     DWConvEnd2EndBenchmark(state, model,
417       xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__wasm_fmagic,
418       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
419       1 /* channel tile */, 9 /* primary tile */);
420   }
qs8_dwconv_up2x9__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)421   static void qs8_dwconv_up2x9__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
422     DWConvEnd2EndBenchmark(state, model,
423       xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic,
424       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
425       2 /* channel tile */, 9 /* primary tile */);
426   }
qs8_dwconv_up4x9__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)427   static void qs8_dwconv_up4x9__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
428     DWConvEnd2EndBenchmark(state, model,
429       xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__wasm_fmagic,
430       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
431       4 /* channel tile */, 9 /* primary tile */);
432   }
433 
// Passes each WAsm fmagic wrapper defined in this #if section to BENCHMARK_QS8_END2END.
// NOTE(review): BENCHMARK_QS8_END2END is not defined in this file (presumably
// bench/end2end.h) - confirm which models it instantiates each function for.
434   BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__wasm_fmagic);
435   BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__wasm_fmagic);
436   BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__wasm_fmagic);
437 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
438 
439 
qs8_dwconv_up1x9__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)440 static void qs8_dwconv_up1x9__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
441   DWConvEnd2EndBenchmark(state, model,
442     xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic,
443     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
444     1 /* channel tile */, 9 /* primary tile */);
445 }
qs8_dwconv_up2x9__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)446 static void qs8_dwconv_up2x9__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
447   DWConvEnd2EndBenchmark(state, model,
448     xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_fmagic,
449     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
450     2 /* channel tile */, 9 /* primary tile */);
451 }
qs8_dwconv_up4x9__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)452 static void qs8_dwconv_up4x9__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
453   DWConvEnd2EndBenchmark(state, model,
454     xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_fmagic,
455     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
456     4 /* channel tile */, 9 /* primary tile */);
457 }
458 
qs8_dwconv_up1x9__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)459 static void qs8_dwconv_up1x9__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
460   DWConvEnd2EndBenchmark(state, model,
461     xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_imagic,
462     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
463     1 /* channel tile */, 9 /* primary tile */);
464 }
qs8_dwconv_up2x9__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)465 static void qs8_dwconv_up2x9__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
466   DWConvEnd2EndBenchmark(state, model,
467     xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic,
468     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
469     2 /* channel tile */, 9 /* primary tile */);
470 }
qs8_dwconv_up4x9__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)471 static void qs8_dwconv_up4x9__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
472   DWConvEnd2EndBenchmark(state, model,
473     xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_imagic,
474     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
475     4 /* channel tile */, 9 /* primary tile */);
476 }
477 
qs8_dwconv_up1x9__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)478 static void qs8_dwconv_up1x9__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
479   DWConvEnd2EndBenchmark(state, model,
480     xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_lrintf,
481     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
482     1 /* channel tile */, 9 /* primary tile */);
483 }
qs8_dwconv_up2x9__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)484 static void qs8_dwconv_up2x9__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
485   DWConvEnd2EndBenchmark(state, model,
486     xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf,
487     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
488     2 /* channel tile */, 9 /* primary tile */);
489 }
qs8_dwconv_up4x9__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)490 static void qs8_dwconv_up4x9__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
491   DWConvEnd2EndBenchmark(state, model,
492     xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_lrintf,
493     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
494     4 /* channel tile */, 9 /* primary tile */);
495 }
496 
// Passes each scalar wrapper (fmagic, imagic, lrintf variants) to BENCHMARK_QS8_END2END.
// These invocations sit outside any architecture #if, unlike the SIMD lists earlier in the file.
// NOTE(review): BENCHMARK_QS8_END2END is not defined in this file (presumably
// bench/end2end.h) - confirm which models it instantiates each function for.
497 BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__scalar_fmagic);
498 BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__scalar_fmagic);
499 BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__scalar_fmagic);
500 
501 BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__scalar_imagic);
502 BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__scalar_imagic);
503 BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__scalar_imagic);
504 
505 BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__scalar_lrintf);
506 BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__scalar_lrintf);
507 BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__scalar_lrintf);
508 
509 
// Expand BENCHMARK_MAIN() only when XNNPACK_BENCHMARK_NO_MAIN is not defined,
// so a build can define that macro to leave the entry point out of this file.
510 #ifndef XNNPACK_BENCHMARK_NO_MAIN
511 BENCHMARK_MAIN();
512 #endif
513