• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5 
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/dwconv.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
24 
25 
DWConvBenchmark(benchmark::State & state,xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,xnn_init_qs8_conv_minmax_params_fn init_params,uint32_t channel_tile,uint32_t primary_tile,benchmark::utils::IsaCheckFunction isa_check=nullptr)26 static void DWConvBenchmark(benchmark::State& state,
27   xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,
28   xnn_init_qs8_conv_minmax_params_fn init_params,
29   uint32_t channel_tile, uint32_t primary_tile,
30   benchmark::utils::IsaCheckFunction isa_check = nullptr)
31 {
32   if (isa_check && !isa_check(state)) {
33     return;
34   }
35 
36   const size_t input_height = state.range(0);
37   const size_t input_width = state.range(1);
38   const size_t kernel_height = state.range(2);
39   const size_t kernel_width = state.range(3);
40   const size_t padding_height = state.range(4);
41   const size_t padding_width = state.range(5);
42   const size_t subsampling = state.range(6);
43   const size_t dilation = state.range(7);
44   const size_t channels = state.range(8);
45 
46   const size_t kernel_size = kernel_height * kernel_width;
47   if (kernel_size != primary_tile) {
48     state.SkipWithError("kernel size mismatch");
49     return;
50   }
51 
52   std::random_device random_device;
53   auto rng = std::mt19937(random_device());
54   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
55   auto i8rng = std::bind(
56     std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()), std::ref(rng));
57 
58   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
59   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
60   const size_t padding_left = padding_width / 2;
61   const size_t padding_top = padding_height / 2;
62   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
63   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
64   const size_t output_size = output_height * output_width;
65   const size_t step_width = dilation == 1 ? subsampling : kernel_width;
66   const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
67 
68   const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, channel_tile);
69 
70   std::vector<int8_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(int8_t));
71   std::generate(a.begin(), a.end(), std::ref(i8rng));
72   std::vector<int8_t> k(channels * kernel_height * kernel_width);
73   std::generate(k.begin(), k.end(), std::ref(i8rng));
74   std::vector<int32_t> b(channels);
75   std::generate(b.begin(), b.end(), std::ref(i32rng));
76 
77   std::vector<int8_t> z(channels + XNN_EXTRA_BYTES / sizeof(int8_t));
78 
79   const size_t k_elements = kernel_size * c_stride;
80   const size_t b_elements = c_stride;
81   const size_t w_size = k_elements * sizeof(int8_t) + b_elements * sizeof(int32_t);
82   const size_t i_elements = output_height * step_height;
83   const size_t c_elements = output_size * channels;
84   const size_t num_buffers = 1 +
85     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
86       (c_elements * sizeof(int8_t) + w_size) + sizeof(void*) * i_elements);
87 
88   std::vector<char, AlignedAllocator<char, 64>> w(w_size * num_buffers);
89   std::fill(w.begin(), w.end(), 0.0f);
90   struct xnn_qs8_packing_params packing_params;
91   packing_params.input_zero_point = 0;
92   xnn_pack_qs8_dwconv_ghw_w(kernel_height, kernel_width, channels, channel_tile,
93       k.data(), b.data(), w.data(), 0 /* extra bytes */, &packing_params);
94   for (size_t n = 1; n < num_buffers; n++) {
95     std::copy(w.cbegin(), w.cbegin() + w_size, w.begin() + n * w_size);
96   }
97 
98   std::vector<const int8_t*> i(i_elements * num_buffers);
99   xnn_operator convolution_op = { };
100   convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
101   convolution_op.input              = a.data();
102   convolution_op.input_pixel_stride = channels;
103   convolution_op.zero_buffer        = z.data();
104   convolution_op.input_height       = input_height;
105   convolution_op.input_width        = input_width;
106   convolution_op.output_height      = output_height;
107   convolution_op.output_width       = output_width;
108   convolution_op.kernel_height      = kernel_height;
109   convolution_op.kernel_width       = kernel_width;
110   convolution_op.stride_height      = subsampling;
111   convolution_op.stride_width       = subsampling;
112   convolution_op.dilation_height    = dilation;
113   convolution_op.dilation_width     = dilation;
114   convolution_op.padding_top        = padding_top;
115   convolution_op.padding_left       = padding_left;
116 
117   xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, 0 /* log2(sizeof(int8_t)) */);
118   for (size_t n = 1; n < num_buffers; n++) {
119     std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
120   }
121 
122   std::vector<int8_t> c(c_elements * num_buffers);
123   std::fill(c.begin(), c.end(), std::nanf(""));
124 
125   xnn_qs8_conv_minmax_params params;
126   init_params(&params,
127     0.5f /* scale */, 0 /* output zero point */, std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
128 
129   size_t buffer_index = 0;
130   for (auto _ : state) {
131     state.PauseTiming();
132     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
133     buffer_index = (buffer_index + 1) % num_buffers;
134     state.ResumeTiming();
135 
136     for (size_t y = 0; y < output_height; y++) {
137       dwconv(channels, output_width,
138         i.data() + buffer_index * i_elements + step_height * y,
139         w.data() + buffer_index * w_size,
140         c.data() + buffer_index * c_elements + y * output_width * channels,
141         kernel_height * step_width * sizeof(void*), 0,
142         0, z.data(), &params);
143     }
144   }
145 
146   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
147   if (cpu_frequency != 0) {
148     state.counters["cpufreq"] = cpu_frequency;
149   }
150 
151   state.counters["FLOPS"] = benchmark::Counter(
152     uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
153     benchmark::Counter::kIsRate);
154 
155   state.counters["bytes"] = benchmark::Counter(
156     uint64_t(state.iterations()) * channels * ((output_size + input_height * input_width + kernel_size) * sizeof(int8_t) + sizeof(int32_t)),
157     benchmark::Counter::kIsRate);
158 }
159 
160 
161 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State & state,const char * net)162   static void qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State& state, const char* net) {
163     DWConvBenchmark(state,
164       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul8_ld64,
165       xnn_init_qs8_conv_minmax_rndnu_neon_params,
166       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
167   }
qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State & state,const char * net)168   static void qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State& state, const char* net) {
169     DWConvBenchmark(state,
170       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld64,
171       xnn_init_qs8_conv_minmax_rndnu_neon_params,
172       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
173   }
qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State & state,const char * net)174   static void qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State& state, const char* net) {
175     DWConvBenchmark(state,
176       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld128,
177       xnn_init_qs8_conv_minmax_rndnu_neon_params,
178       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
179   }
qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State & state,const char * net)180   static void qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State& state, const char* net) {
181     DWConvBenchmark(state,
182       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mla8_ld64,
183       xnn_init_qs8_conv_minmax_rndnu_neon_params,
184       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
185   }
qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State & state,const char * net)186   static void qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State& state, const char* net) {
187     DWConvBenchmark(state,
188       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64,
189       xnn_init_qs8_conv_minmax_rndnu_neon_params,
190       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
191   }
qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State & state,const char * net)192   static void qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State& state, const char* net) {
193     DWConvBenchmark(state,
194       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld128,
195       xnn_init_qs8_conv_minmax_rndnu_neon_params,
196       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
197   }
qs8_dwconv_up8x9__neon_mul16(benchmark::State & state,const char * net)198   static void qs8_dwconv_up8x9__neon_mul16(benchmark::State& state, const char* net) {
199     DWConvBenchmark(state,
200       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16,
201       xnn_init_qs8_conv_minmax_rndnu_neon_params,
202       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
203   }
qs8_dwconv_up16x9__neon_mul16(benchmark::State & state,const char * net)204   static void qs8_dwconv_up16x9__neon_mul16(benchmark::State& state, const char* net) {
205     DWConvBenchmark(state,
206       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul16,
207       xnn_init_qs8_conv_minmax_rndnu_neon_params,
208       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
209   }
qs8_dwconv_up24x9__neon_mul16(benchmark::State & state,const char * net)210   static void qs8_dwconv_up24x9__neon_mul16(benchmark::State& state, const char* net) {
211     DWConvBenchmark(state,
212       xnn_qs8_dwconv_minmax_rndnu_ukernel_up24x9__neon_mul16,
213       xnn_init_qs8_conv_minmax_rndnu_neon_params,
214       24 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
215   }
qs8_dwconv_up32x9__neon_mul16(benchmark::State & state,const char * net)216   static void qs8_dwconv_up32x9__neon_mul16(benchmark::State& state, const char* net) {
217     DWConvBenchmark(state,
218       xnn_qs8_dwconv_minmax_rndnu_ukernel_up32x9__neon_mul16,
219       xnn_init_qs8_conv_minmax_rndnu_neon_params,
220       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
221   }
qs8_dwconv_up8x25__neon_mul8_ld64(benchmark::State & state,const char * net)222   static void qs8_dwconv_up8x25__neon_mul8_ld64(benchmark::State& state, const char* net) {
223     DWConvBenchmark(state,
224       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8_ld64,
225       xnn_init_qs8_conv_minmax_rndnu_neon_params,
226       8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
227   }
qs8_dwconv_up16x25__neon_mul8_ld64(benchmark::State & state,const char * net)228   static void qs8_dwconv_up16x25__neon_mul8_ld64(benchmark::State& state, const char* net) {
229     DWConvBenchmark(state,
230       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul8_ld64,
231       xnn_init_qs8_conv_minmax_rndnu_neon_params,
232       16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
233   }
qs8_dwconv_up16x25__neon_mul8_ld128(benchmark::State & state,const char * net)234   static void qs8_dwconv_up16x25__neon_mul8_ld128(benchmark::State& state, const char* net) {
235     DWConvBenchmark(state,
236       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul8_ld128,
237       xnn_init_qs8_conv_minmax_rndnu_neon_params,
238       16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
239   }
qs8_dwconv_up8x25__neon_mla8_ld64(benchmark::State & state,const char * net)240   static void qs8_dwconv_up8x25__neon_mla8_ld64(benchmark::State& state, const char* net) {
241     DWConvBenchmark(state,
242       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64,
243       xnn_init_qs8_conv_minmax_rndnu_neon_params,
244       8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
245   }
qs8_dwconv_up16x25__neon_mla8_ld64(benchmark::State & state,const char * net)246   static void qs8_dwconv_up16x25__neon_mla8_ld64(benchmark::State& state, const char* net) {
247     DWConvBenchmark(state,
248       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64,
249       xnn_init_qs8_conv_minmax_rndnu_neon_params,
250       16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
251   }
qs8_dwconv_up16x25__neon_mla8_ld128(benchmark::State & state,const char * net)252   static void qs8_dwconv_up16x25__neon_mla8_ld128(benchmark::State& state, const char* net) {
253     DWConvBenchmark(state,
254       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld128,
255       xnn_init_qs8_conv_minmax_rndnu_neon_params,
256       16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
257   }
qs8_dwconv_up8x25__neon_mul16(benchmark::State & state,const char * net)258   static void qs8_dwconv_up8x25__neon_mul16(benchmark::State& state, const char* net) {
259     DWConvBenchmark(state,
260       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16,
261       xnn_init_qs8_conv_minmax_rndnu_neon_params,
262       8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
263   }
qs8_dwconv_up16x25__neon_mul16(benchmark::State & state,const char * net)264   static void qs8_dwconv_up16x25__neon_mul16(benchmark::State& state, const char* net) {
265     DWConvBenchmark(state,
266       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul16,
267       xnn_init_qs8_conv_minmax_rndnu_neon_params,
268       16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
269   }
qs8_dwconv_up24x25__neon_mul16(benchmark::State & state,const char * net)270   static void qs8_dwconv_up24x25__neon_mul16(benchmark::State& state, const char* net) {
271     DWConvBenchmark(state,
272       xnn_qs8_dwconv_minmax_rndnu_ukernel_up24x25__neon_mul16,
273       xnn_init_qs8_conv_minmax_rndnu_neon_params,
274       24 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
275   }
qs8_dwconv_up32x25__neon_mul16(benchmark::State & state,const char * net)276   static void qs8_dwconv_up32x25__neon_mul16(benchmark::State& state, const char* net) {
277     DWConvBenchmark(state,
278       xnn_qs8_dwconv_minmax_rndnu_ukernel_up32x25__neon_mul16,
279       xnn_init_qs8_conv_minmax_rndnu_neon_params,
280       32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
281   }
282 
283   BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mul8_ld64);
284   BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul8_ld64);
285   BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul8_ld128);
286   BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mla8_ld64);
287   BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mla8_ld64);
288   BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mla8_ld128);
289   BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mul16);
290   BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul16);
291   BENCHMARK_DWCONV(qs8_dwconv_up24x9__neon_mul16);
292   BENCHMARK_DWCONV(qs8_dwconv_up32x9__neon_mul16);
293   BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mul8_ld64);
294   BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul8_ld64);
295   BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul8_ld128);
296   BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mla8_ld64);
297   BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mla8_ld64);
298   BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mla8_ld128);
299   BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mul16);
300   BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul16);
301   BENCHMARK_DWCONV(qs8_dwconv_up24x25__neon_mul16);
302   BENCHMARK_DWCONV(qs8_dwconv_up32x25__neon_mul16);
303 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
304 
305 
306 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State & state,const char * net)307   static void qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State& state, const char* net) {
308     DWConvBenchmark(state,
309       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx512skx_mul32,
310       xnn_init_qs8_conv_minmax_fp32_avx512_params,
311       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX);
312   }
qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State & state,const char * net)313   static void qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State& state, const char* net) {
314     DWConvBenchmark(state,
315       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32,
316       xnn_init_qs8_conv_minmax_fp32_avx512_params,
317       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX);
318   }
qs8_dwconv_up16x9__avx2_mul16_vpmovsx(benchmark::State & state,const char * net)319   static void qs8_dwconv_up16x9__avx2_mul16_vpmovsx(benchmark::State& state, const char* net) {
320     DWConvBenchmark(state,
321       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpmovsx,
322       xnn_init_qs8_conv_minmax_fp32_avx2_params,
323       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
324   }
qs8_dwconv_up32x9__avx2_mul16_vpmovsx(benchmark::State & state,const char * net)325   static void qs8_dwconv_up32x9__avx2_mul16_vpmovsx(benchmark::State& state, const char* net) {
326     DWConvBenchmark(state,
327       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpmovsx,
328       xnn_init_qs8_conv_minmax_fp32_avx2_params,
329       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
330   }
qs8_dwconv_up16x9__avx2_mul16_vpunpck(benchmark::State & state,const char * net)331   static void qs8_dwconv_up16x9__avx2_mul16_vpunpck(benchmark::State& state, const char* net) {
332     DWConvBenchmark(state,
333       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpunpck,
334       xnn_init_qs8_conv_minmax_fp32_avx2_params,
335       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
336   }
qs8_dwconv_up32x9__avx2_mul16_vpunpck(benchmark::State & state,const char * net)337   static void qs8_dwconv_up32x9__avx2_mul16_vpunpck(benchmark::State& state, const char* net) {
338     DWConvBenchmark(state,
339       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpunpck,
340       xnn_init_qs8_conv_minmax_fp32_avx2_params,
341       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
342   }
qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck(benchmark::State & state,const char * net)343   static void qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck(benchmark::State& state, const char* net) {
344     DWConvBenchmark(state,
345       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_add16_vpunpck,
346       xnn_init_qs8_conv_minmax_fp32_avx2_params,
347       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
348   }
qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck(benchmark::State & state,const char * net)349   static void qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck(benchmark::State& state, const char* net) {
350     DWConvBenchmark(state,
351       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_add16_vpunpck,
352       xnn_init_qs8_conv_minmax_fp32_avx2_params,
353       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
354   }
qs8_dwconv_up8x9__avx2_mul32(benchmark::State & state,const char * net)355   static void qs8_dwconv_up8x9__avx2_mul32(benchmark::State& state, const char* net) {
356     DWConvBenchmark(state,
357       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx2_mul32,
358       xnn_init_qs8_conv_minmax_fp32_avx2_params,
359       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
360   }
qs8_dwconv_up16x9__avx2_mul32(benchmark::State & state,const char * net)361   static void qs8_dwconv_up16x9__avx2_mul32(benchmark::State& state, const char* net) {
362     DWConvBenchmark(state,
363       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32,
364       xnn_init_qs8_conv_minmax_fp32_avx2_params,
365       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
366   }
qs8_dwconv_up32x9__avx2_mul32(benchmark::State & state,const char * net)367   static void qs8_dwconv_up32x9__avx2_mul32(benchmark::State& state, const char* net) {
368     DWConvBenchmark(state,
369       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul32,
370       xnn_init_qs8_conv_minmax_fp32_avx2_params,
371       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
372   }
qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State & state,const char * net)373   static void qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State& state, const char* net) {
374     DWConvBenchmark(state,
375       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul16_add16,
376       xnn_init_qs8_conv_minmax_fp32_sse4_params,
377       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
378   }
qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State & state,const char * net)379   static void qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State& state, const char* net) {
380     DWConvBenchmark(state,
381       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16,
382       xnn_init_qs8_conv_minmax_fp32_sse4_params,
383       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
384   }
qs8_dwconv_up8x9__avx_mul16(benchmark::State & state,const char * net)385   static void qs8_dwconv_up8x9__avx_mul16(benchmark::State& state, const char* net) {
386     DWConvBenchmark(state,
387       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16,
388       xnn_init_qs8_conv_minmax_fp32_sse4_params,
389       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
390   }
qs8_dwconv_up16x9__avx_mul16(benchmark::State & state,const char * net)391   static void qs8_dwconv_up16x9__avx_mul16(benchmark::State& state, const char* net) {
392     DWConvBenchmark(state,
393       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16,
394       xnn_init_qs8_conv_minmax_fp32_sse4_params,
395       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
396   }
qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State & state,const char * net)397   static void qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State& state, const char* net) {
398     DWConvBenchmark(state,
399       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16_add16,
400       xnn_init_qs8_conv_minmax_fp32_sse4_params,
401       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
402   }
qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State & state,const char * net)403   static void qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State& state, const char* net) {
404     DWConvBenchmark(state,
405       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16,
406       xnn_init_qs8_conv_minmax_fp32_sse4_params,
407       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
408   }
qs8_dwconv_up8x9__avx_mul32(benchmark::State & state,const char * net)409   static void qs8_dwconv_up8x9__avx_mul32(benchmark::State& state, const char* net) {
410     DWConvBenchmark(state,
411       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
412       xnn_init_qs8_conv_minmax_fp32_sse4_params,
413       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
414   }
qs8_dwconv_up16x9__avx_mul32(benchmark::State & state,const char * net)415   static void qs8_dwconv_up16x9__avx_mul32(benchmark::State& state, const char* net) {
416     DWConvBenchmark(state,
417       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
418       xnn_init_qs8_conv_minmax_fp32_sse4_params,
419       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
420   }
qs8_dwconv_up8x9__sse41_mul16(benchmark::State & state,const char * net)421   static void qs8_dwconv_up8x9__sse41_mul16(benchmark::State& state, const char* net) {
422     DWConvBenchmark(state,
423       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16,
424       xnn_init_qs8_conv_minmax_fp32_sse4_params,
425       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
426   }
qs8_dwconv_up16x9__sse41_mul16(benchmark::State & state,const char * net)427   static void qs8_dwconv_up16x9__sse41_mul16(benchmark::State& state, const char* net) {
428     DWConvBenchmark(state,
429       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16,
430       xnn_init_qs8_conv_minmax_fp32_sse4_params,
431       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
432   }
qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State & state,const char * net)433   static void qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State& state, const char* net) {
434     DWConvBenchmark(state,
435       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16,
436       xnn_init_qs8_conv_minmax_fp32_sse4_params,
437       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
438   }
qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State & state,const char * net)439   static void qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State& state, const char* net) {
440     DWConvBenchmark(state,
441       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16_add16,
442       xnn_init_qs8_conv_minmax_fp32_sse4_params,
443       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
444   }
qs8_dwconv_up8x9__sse41_mul32(benchmark::State & state,const char * net)445   static void qs8_dwconv_up8x9__sse41_mul32(benchmark::State& state, const char* net) {
446     DWConvBenchmark(state,
447       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32,
448       xnn_init_qs8_conv_minmax_fp32_sse4_params,
449       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
450   }
qs8_dwconv_up16x9__sse41_mul32(benchmark::State & state,const char * net)451   static void qs8_dwconv_up16x9__sse41_mul32(benchmark::State& state, const char* net) {
452     DWConvBenchmark(state,
453       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32,
454       xnn_init_qs8_conv_minmax_fp32_sse4_params,
455       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
456   }
qs8_dwconv_up8x9__sse2_mul16(benchmark::State & state,const char * net)457   static void qs8_dwconv_up8x9__sse2_mul16(benchmark::State& state, const char* net) {
458     DWConvBenchmark(state,
459       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16,
460       xnn_init_qs8_conv_minmax_fp32_sse2_params,
461       8 /* channel tile */, 9 /* primary tile */);
462   }
qs8_dwconv_up16x9__sse2_mul16(benchmark::State & state,const char * net)463   static void qs8_dwconv_up16x9__sse2_mul16(benchmark::State& state, const char* net) {
464     DWConvBenchmark(state,
465       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16,
466       xnn_init_qs8_conv_minmax_fp32_sse2_params,
467       16 /* channel tile */, 9 /* primary tile */);
468   }
qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State & state,const char * net)469   static void qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State& state, const char* net) {
470     DWConvBenchmark(state,
471       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16,
472       xnn_init_qs8_conv_minmax_fp32_sse2_params,
473       8 /* channel tile */, 9 /* primary tile */);
474   }
qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State & state,const char * net)475   static void qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State& state, const char* net) {
476     DWConvBenchmark(state,
477       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16_add16,
478       xnn_init_qs8_conv_minmax_fp32_sse2_params,
479       16 /* channel tile */, 9 /* primary tile */);
480   }
481 
482   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx512skx_mul32);
483   BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx512skx_mul32);
484 
485   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_vpmovsx);
486   BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_vpmovsx);
487   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_vpunpck);
488   BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_vpunpck);
489   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck);
490   BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck);
491   BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx2_mul32);
492   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul32);
493   BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul32);
494 
495   BENCHMARK_DWCONV(qs8_dwconv_up8x9__xop_mul16_add16);
496   BENCHMARK_DWCONV(qs8_dwconv_up16x9__xop_mul16_add16);
497 
498   BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul16);
499   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul16);
500   BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul16_add16);
501   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul16_add16);
502   BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul32);
503   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul32);
504 
505   BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul16);
506   BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul16);
507   BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul16_add16);
508   BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul16_add16);
509   BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul32);
510   BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul32);
511 
512   BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse2_mul16);
513   BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse2_mul16);
514   BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse2_mul16_add16);
515   BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse2_mul16_add16);
516 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
517 
518 
519 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State & state,const char * net)520   static void qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State& state, const char* net) {
521     DWConvBenchmark(state,
522       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16,
523       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
524       8 /* channel tile */, 9 /* primary tile */);
525   }
qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State & state,const char * net)526   static void qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State& state, const char* net) {
527     DWConvBenchmark(state,
528       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16,
529       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
530       16 /* channel tile */, 9 /* primary tile */);
531   }
532 
533   BENCHMARK_DWCONV(qs8_dwconv_up8x9__wasmsimd_mul16);
534   BENCHMARK_DWCONV(qs8_dwconv_up16x9__wasmsimd_mul16);
535 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
536 
537 
538 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_dwconv_up1x9__wasm_fmagic(benchmark::State & state,const char * net)539   static void qs8_dwconv_up1x9__wasm_fmagic(benchmark::State& state, const char* net) {
540     DWConvBenchmark(state,
541       xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__wasm_fmagic,
542       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
543       1 /* channel tile */, 9 /* primary tile */);
544   }
qs8_dwconv_up2x9__wasm_fmagic(benchmark::State & state,const char * net)545   static void qs8_dwconv_up2x9__wasm_fmagic(benchmark::State& state, const char* net) {
546     DWConvBenchmark(state,
547       xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic,
548       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
549       2 /* channel tile */, 9 /* primary tile */);
550   }
qs8_dwconv_up4x9__wasm_fmagic(benchmark::State & state,const char * net)551   static void qs8_dwconv_up4x9__wasm_fmagic(benchmark::State& state, const char* net) {
552     DWConvBenchmark(state,
553       xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__wasm_fmagic,
554       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
555       4 /* channel tile */, 9 /* primary tile */);
556   }
557 
// Pass each wasm_fmagic wrapper defined above (channel tiles 1, 2 and 4) to
// BENCHMARK_DWCONV.
// NOTE(review): BENCHMARK_DWCONV is not defined in this file; it presumably
// comes from "bench/dwconv.h" and registers the wrapper with the benchmark
// framework - confirm there.
558   BENCHMARK_DWCONV(qs8_dwconv_up1x9__wasm_fmagic);
559   BENCHMARK_DWCONV(qs8_dwconv_up2x9__wasm_fmagic);
560   BENCHMARK_DWCONV(qs8_dwconv_up4x9__wasm_fmagic);
561 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
562 
563 
qs8_dwconv_up1x9__scalar_fmagic(benchmark::State & state,const char * net)564 static void qs8_dwconv_up1x9__scalar_fmagic(benchmark::State& state, const char* net) {
565   DWConvBenchmark(state,
566     xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic,
567     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
568     1 /* channel tile */, 9 /* primary tile */);
569 }
qs8_dwconv_up2x9__scalar_fmagic(benchmark::State & state,const char * net)570 static void qs8_dwconv_up2x9__scalar_fmagic(benchmark::State& state, const char* net) {
571   DWConvBenchmark(state,
572     xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_fmagic,
573     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
574     2 /* channel tile */, 9 /* primary tile */);
575 }
qs8_dwconv_up4x9__scalar_fmagic(benchmark::State & state,const char * net)576 static void qs8_dwconv_up4x9__scalar_fmagic(benchmark::State& state, const char* net) {
577   DWConvBenchmark(state,
578     xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_fmagic,
579     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
580     4 /* channel tile */, 9 /* primary tile */);
581 }
582 
qs8_dwconv_up1x9__scalar_imagic(benchmark::State & state,const char * net)583 static void qs8_dwconv_up1x9__scalar_imagic(benchmark::State& state, const char* net) {
584   DWConvBenchmark(state,
585     xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_imagic,
586     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
587     1 /* channel tile */, 9 /* primary tile */);
588 }
qs8_dwconv_up2x9__scalar_imagic(benchmark::State & state,const char * net)589 static void qs8_dwconv_up2x9__scalar_imagic(benchmark::State& state, const char* net) {
590   DWConvBenchmark(state,
591     xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic,
592     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
593     2 /* channel tile */, 9 /* primary tile */);
594 }
qs8_dwconv_up4x9__scalar_imagic(benchmark::State & state,const char * net)595 static void qs8_dwconv_up4x9__scalar_imagic(benchmark::State& state, const char* net) {
596   DWConvBenchmark(state,
597     xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_imagic,
598     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
599     4 /* channel tile */, 9 /* primary tile */);
600 }
601 
qs8_dwconv_up1x9__scalar_lrintf(benchmark::State & state,const char * net)602 static void qs8_dwconv_up1x9__scalar_lrintf(benchmark::State& state, const char* net) {
603   DWConvBenchmark(state,
604     xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_lrintf,
605     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
606     1 /* channel tile */, 9 /* primary tile */);
607 }
qs8_dwconv_up2x9__scalar_lrintf(benchmark::State & state,const char * net)608 static void qs8_dwconv_up2x9__scalar_lrintf(benchmark::State& state, const char* net) {
609   DWConvBenchmark(state,
610     xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf,
611     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
612     2 /* channel tile */, 9 /* primary tile */);
613 }
qs8_dwconv_up4x9__scalar_lrintf(benchmark::State & state,const char * net)614 static void qs8_dwconv_up4x9__scalar_lrintf(benchmark::State& state, const char* net) {
615   DWConvBenchmark(state,
616     xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_lrintf,
617     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
618     4 /* channel tile */, 9 /* primary tile */);
619 }
620 
// Pass each scalar wrapper defined above to BENCHMARK_DWCONV, grouped by the
// wrapper-name suffix (fmagic, imagic, lrintf) and ordered by channel tile.
// These invocations sit outside any architecture #if, unlike the groups above.
// NOTE(review): BENCHMARK_DWCONV is not defined in this file; it presumably
// comes from "bench/dwconv.h" and registers the wrapper with the benchmark
// framework - confirm there.
621 BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_fmagic);
622 BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_fmagic);
623 BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_fmagic);
624 
625 BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_imagic);
626 BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_imagic);
627 BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_imagic);
628 
629 BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_lrintf);
630 BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_lrintf);
631 BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_lrintf);
632 
633 
// Expand BENCHMARK_MAIN() here unless the build defines
// XNNPACK_BENCHMARK_NO_MAIN.
// NOTE(review): BENCHMARK_MAIN() is the Google Benchmark macro that supplies
// the program's main(); the opt-out is presumably for builds that link this
// file into a binary providing its own main - confirm in the build files.
634 #ifndef XNNPACK_BENCHMARK_NO_MAIN
635 BENCHMARK_MAIN();
636 #endif
637