// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5 
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/dwconv.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
24 
25 
DWConvBenchmark(benchmark::State & state,xnn_f32_dwconv_minmax_unipass_ukernel_function dwconv,xnn_init_f32_minmax_params_fn init_params,uint32_t channel_tile,uint32_t primary_tile,benchmark::utils::IsaCheckFunction isa_check=nullptr)26 static void DWConvBenchmark(benchmark::State& state,
27   xnn_f32_dwconv_minmax_unipass_ukernel_function dwconv,
28   xnn_init_f32_minmax_params_fn init_params,
29   uint32_t channel_tile, uint32_t primary_tile,
30   benchmark::utils::IsaCheckFunction isa_check = nullptr)
31 {
32   if (isa_check && !isa_check(state)) {
33     return;
34   }
35 
36   const size_t input_height = state.range(0);
37   const size_t input_width = state.range(1);
38   const size_t kernel_height = state.range(2);
39   const size_t kernel_width = state.range(3);
40   const size_t padding_height = state.range(4);
41   const size_t padding_width = state.range(5);
42   const size_t subsampling = state.range(6);
43   const size_t dilation = state.range(7);
44   const size_t channels = state.range(8);
45 
46   const size_t kernel_size = kernel_height * kernel_width;
47   if (kernel_size != primary_tile) {
48     state.SkipWithError("kernel size mismatch");
49     return;
50   }
51 
52   std::random_device random_device;
53   auto rng = std::mt19937(random_device());
54   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
55 
56   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
57   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
58   const size_t padding_left = padding_width / 2;
59   const size_t padding_top = padding_height / 2;
60   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
61   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
62   const size_t output_size = output_height * output_width;
63   const size_t step_width = dilation == 1 ? subsampling : kernel_width;
64   const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
65 
66   const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, channel_tile);
67 
68   std::vector<float> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(float));
69   std::generate(a.begin(), a.end(), std::ref(f32rng));
70   std::vector<float> k(channels * kernel_height * kernel_width);
71   std::generate(k.begin(), k.end(), std::ref(f32rng));
72   std::vector<float> b(channels);
73   std::generate(b.begin(), b.end(), std::ref(f32rng));
74 
75   std::vector<float> z(channels + XNN_EXTRA_BYTES / sizeof(float));
76 
77   const size_t w_elements = (kernel_size + 1) * c_stride;
78   const size_t i_elements = output_height * step_height;
79   const size_t c_elements = output_size * channels;
80   const size_t num_buffers = 1 +
81     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
82       sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);
83 
84   std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
85   std::fill(w.begin(), w.end(), 0.0f);
86   xnn_pack_f32_dwconv_ghw_w(kernel_height, kernel_width, channels, channel_tile,
87       k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
88   for (size_t n = 1; n < num_buffers; n++) {
89     std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
90   }
91 
92   std::vector<const float*> i(i_elements * num_buffers);
93   xnn_operator convolution_op = { };
94   convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
95   convolution_op.input              = a.data();
96   convolution_op.input_pixel_stride = channels;
97   convolution_op.zero_buffer        = z.data();
98   convolution_op.input_height       = input_height;
99   convolution_op.input_width        = input_width;
100   convolution_op.output_height      = output_height;
101   convolution_op.output_width       = output_width;
102   convolution_op.kernel_height      = kernel_height;
103   convolution_op.kernel_width       = kernel_width;
104   convolution_op.stride_height      = subsampling;
105   convolution_op.stride_width       = subsampling;
106   convolution_op.dilation_height    = dilation;
107   convolution_op.dilation_width     = dilation;
108   convolution_op.padding_top        = padding_top;
109   convolution_op.padding_left       = padding_left;
110 
111   xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, 2 /* log2(sizeof(float)) */);
112   for (size_t n = 1; n < num_buffers; n++) {
113     std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
114   }
115 
116   std::vector<float> c(c_elements * num_buffers);
117   std::fill(c.begin(), c.end(), std::nanf(""));
118 
119   xnn_f32_minmax_params params;
120   init_params(&params, -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
121 
122   size_t buffer_index = 0;
123   for (auto _ : state) {
124     state.PauseTiming();
125     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
126     buffer_index = (buffer_index + 1) % num_buffers;
127     state.ResumeTiming();
128 
129     for (size_t y = 0; y < output_height; y++) {
130       dwconv(channels, output_width,
131         i.data() + buffer_index * i_elements + step_height * y,
132         w.data() + buffer_index * w_elements,
133         c.data() + buffer_index * c_elements + y * output_width * channels,
134         kernel_height * step_width * sizeof(void*), 0,
135         0, z.data(), &params);
136     }
137   }
138 
139   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
140   if (cpu_frequency != 0) {
141     state.counters["cpufreq"] = cpu_frequency;
142   }
143 
144   state.counters["FLOPS"] = benchmark::Counter(
145     uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
146     benchmark::Counter::kIsRate);
147 
148   state.counters["bytes"] = benchmark::Counter(
149     uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(float),
150     benchmark::Counter::kIsRate);
151 }
152 
153 
154 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
f32_dwconv_4x9__aarch64_neonfma(benchmark::State & state,const char * net)155   static void f32_dwconv_4x9__aarch64_neonfma(benchmark::State& state, const char* net) {
156     DWConvBenchmark(state,
157       xnn_f32_dwconv_minmax_ukernel_up4x9__neon,
158       xnn_init_f32_minmax_scalar_params,
159       4 /* channel tile */, 9 /* primary tile */);
160   }
f32_dwconv_4x9__aarch64_neonfma_cortex_a55(benchmark::State & state,const char * net)161   static void f32_dwconv_4x9__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
162     DWConvBenchmark(state,
163       xnn_f32_dwconv_minmax_ukernel_up4x9__neonfma,
164       xnn_init_f32_minmax_scalar_params,
165       4 /* channel tile */, 9 /* primary tile */);
166   }
167 
168   BENCHMARK_DWCONV(f32_dwconv_4x9__aarch64_neonfma)
BENCHMARK_DWCONV(f32_dwconv_4x9__aarch64_neonfma_cortex_a55)169   BENCHMARK_DWCONV(f32_dwconv_4x9__aarch64_neonfma_cortex_a55)
170 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
171 
172 
173 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
174   static void f32_dwconv_4x4__neon_acc2(benchmark::State& state, const char* net) {
175     DWConvBenchmark(state,
176       xnn_f32_dwconv_minmax_ukernel_up4x4__neon_acc2,
177       xnn_init_f32_minmax_scalar_params,
178       4 /* channel tile */, 4 /* primary tile */,
179       benchmark::utils::CheckNEON);
180   }
f32_dwconv_4x4__neon(benchmark::State & state,const char * net)181   static void f32_dwconv_4x4__neon(benchmark::State& state, const char* net) {
182     DWConvBenchmark(state,
183       xnn_f32_dwconv_minmax_ukernel_up4x4__neon,
184       xnn_init_f32_minmax_scalar_params,
185       4 /* channel tile */, 4 /* primary tile */,
186       benchmark::utils::CheckNEON);
187   }
f32_dwconv_4x4__neonfma_acc2(benchmark::State & state,const char * net)188   static void f32_dwconv_4x4__neonfma_acc2(benchmark::State& state, const char* net) {
189     DWConvBenchmark(state,
190       xnn_f32_dwconv_minmax_ukernel_up4x4__neonfma_acc2,
191       xnn_init_f32_minmax_scalar_params,
192       4 /* channel tile */, 4 /* primary tile */,
193       benchmark::utils::CheckNEONFMA);
194   }
f32_dwconv_4x4__neonfma(benchmark::State & state,const char * net)195   static void f32_dwconv_4x4__neonfma(benchmark::State& state, const char* net) {
196     DWConvBenchmark(state,
197       xnn_f32_dwconv_minmax_ukernel_up4x4__neonfma,
198       xnn_init_f32_minmax_scalar_params,
199       4 /* channel tile */, 4 /* primary tile */,
200       benchmark::utils::CheckNEONFMA);
201   }
f32_dwconv_4x9__neon_acc2(benchmark::State & state,const char * net)202   static void f32_dwconv_4x9__neon_acc2(benchmark::State& state, const char* net) {
203     DWConvBenchmark(state,
204       xnn_f32_dwconv_minmax_ukernel_up4x9__neon_acc2,
205       xnn_init_f32_minmax_scalar_params,
206       4 /* channel tile */, 9 /* primary tile */,
207       benchmark::utils::CheckNEON);
208   }
f32_dwconv_4x9__neon(benchmark::State & state,const char * net)209   static void f32_dwconv_4x9__neon(benchmark::State& state, const char* net) {
210     DWConvBenchmark(state,
211       xnn_f32_dwconv_minmax_ukernel_up4x9__neon,
212       xnn_init_f32_minmax_scalar_params,
213       4 /* channel tile */, 9 /* primary tile */,
214       benchmark::utils::CheckNEON);
215   }
f32_dwconv_4x9__neonfma_acc2(benchmark::State & state,const char * net)216   static void f32_dwconv_4x9__neonfma_acc2(benchmark::State& state, const char* net) {
217     DWConvBenchmark(state,
218       xnn_f32_dwconv_minmax_ukernel_up4x9__neonfma_acc2,
219       xnn_init_f32_minmax_scalar_params,
220       4 /* channel tile */, 9 /* primary tile */,
221       benchmark::utils::CheckNEONFMA);
222   }
f32_dwconv_4x9__neonfma(benchmark::State & state,const char * net)223   static void f32_dwconv_4x9__neonfma(benchmark::State& state, const char* net) {
224     DWConvBenchmark(state,
225       xnn_f32_dwconv_minmax_ukernel_up4x9__neonfma,
226       xnn_init_f32_minmax_scalar_params,
227       4 /* channel tile */, 9 /* primary tile */,
228       benchmark::utils::CheckNEONFMA);
229   }
f32_dwconv_4x25__neon_acc2(benchmark::State & state,const char * net)230   static void f32_dwconv_4x25__neon_acc2(benchmark::State& state, const char* net) {
231     DWConvBenchmark(state,
232       xnn_f32_dwconv_minmax_ukernel_up4x25__neon_acc2,
233       xnn_init_f32_minmax_scalar_params,
234       4 /* channel tile */, 25 /* primary tile */,
235       benchmark::utils::CheckNEON);
236   }
f32_dwconv_4x25__neon(benchmark::State & state,const char * net)237   static void f32_dwconv_4x25__neon(benchmark::State& state, const char* net) {
238     DWConvBenchmark(state,
239       xnn_f32_dwconv_minmax_ukernel_up4x25__neon,
240       xnn_init_f32_minmax_scalar_params,
241       4 /* channel tile */, 25 /* primary tile */,
242       benchmark::utils::CheckNEON);
243   }
f32_dwconv_4x25__neonfma_acc2(benchmark::State & state,const char * net)244   static void f32_dwconv_4x25__neonfma_acc2(benchmark::State& state, const char* net) {
245     DWConvBenchmark(state,
246       xnn_f32_dwconv_minmax_ukernel_up4x25__neonfma_acc2,
247       xnn_init_f32_minmax_scalar_params,
248       4 /* channel tile */, 25 /* primary tile */,
249       benchmark::utils::CheckNEONFMA);
250   }
f32_dwconv_4x25__neonfma(benchmark::State & state,const char * net)251   static void f32_dwconv_4x25__neonfma(benchmark::State& state, const char* net) {
252     DWConvBenchmark(state,
253       xnn_f32_dwconv_minmax_ukernel_up4x25__neonfma,
254       xnn_init_f32_minmax_scalar_params,
255       4 /* channel tile */, 25 /* primary tile */,
256       benchmark::utils::CheckNEONFMA);
257   }
f32_dwconv_8x4__neon_acc2(benchmark::State & state,const char * net)258   static void f32_dwconv_8x4__neon_acc2(benchmark::State& state, const char* net) {
259     DWConvBenchmark(state,
260       xnn_f32_dwconv_minmax_ukernel_up8x4__neon_acc2,
261       xnn_init_f32_minmax_scalar_params,
262       8 /* channel tile */, 4 /* primary tile */,
263       benchmark::utils::CheckNEON);
264   }
f32_dwconv_8x4__neon(benchmark::State & state,const char * net)265   static void f32_dwconv_8x4__neon(benchmark::State& state, const char* net) {
266     DWConvBenchmark(state,
267       xnn_f32_dwconv_minmax_ukernel_up8x4__neon,
268       xnn_init_f32_minmax_scalar_params,
269       8 /* channel tile */, 4 /* primary tile */,
270       benchmark::utils::CheckNEON);
271   }
f32_dwconv_8x4__neonfma_acc2(benchmark::State & state,const char * net)272   static void f32_dwconv_8x4__neonfma_acc2(benchmark::State& state, const char* net) {
273     DWConvBenchmark(state,
274       xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma_acc2,
275       xnn_init_f32_minmax_scalar_params,
276       8 /* channel tile */, 4 /* primary tile */,
277       benchmark::utils::CheckNEONFMA);
278   }
f32_dwconv_8x4__neonfma(benchmark::State & state,const char * net)279   static void f32_dwconv_8x4__neonfma(benchmark::State& state, const char* net) {
280     DWConvBenchmark(state,
281       xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma,
282       xnn_init_f32_minmax_scalar_params,
283       8 /* channel tile */, 4 /* primary tile */,
284       benchmark::utils::CheckNEONFMA);
285   }
f32_dwconv_8x9__neon_acc2(benchmark::State & state,const char * net)286   static void f32_dwconv_8x9__neon_acc2(benchmark::State& state, const char* net) {
287     DWConvBenchmark(state,
288       xnn_f32_dwconv_minmax_ukernel_up8x9__neon_acc2,
289       xnn_init_f32_minmax_scalar_params,
290       8 /* channel tile */, 9 /* primary tile */,
291       benchmark::utils::CheckNEON);
292   }
f32_dwconv_8x9__neon(benchmark::State & state,const char * net)293   static void f32_dwconv_8x9__neon(benchmark::State& state, const char* net) {
294     DWConvBenchmark(state,
295       xnn_f32_dwconv_minmax_ukernel_up8x9__neon,
296       xnn_init_f32_minmax_scalar_params,
297       8 /* channel tile */, 9 /* primary tile */,
298       benchmark::utils::CheckNEON);
299   }
f32_dwconv_8x9__neonfma_acc2(benchmark::State & state,const char * net)300   static void f32_dwconv_8x9__neonfma_acc2(benchmark::State& state, const char* net) {
301     DWConvBenchmark(state,
302       xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma_acc2,
303       xnn_init_f32_minmax_scalar_params,
304       8 /* channel tile */, 9 /* primary tile */,
305       benchmark::utils::CheckNEONFMA);
306   }
f32_dwconv_8x9__neonfma(benchmark::State & state,const char * net)307   static void f32_dwconv_8x9__neonfma(benchmark::State& state, const char* net) {
308     DWConvBenchmark(state,
309       xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma,
310       xnn_init_f32_minmax_scalar_params,
311       8 /* channel tile */, 9 /* primary tile */,
312       benchmark::utils::CheckNEONFMA);
313   }
f32_dwconv_8x25__neon_acc2(benchmark::State & state,const char * net)314   static void f32_dwconv_8x25__neon_acc2(benchmark::State& state, const char* net) {
315     DWConvBenchmark(state,
316       xnn_f32_dwconv_minmax_ukernel_up8x25__neon_acc2,
317       xnn_init_f32_minmax_scalar_params,
318       8 /* channel tile */, 25 /* primary tile */,
319       benchmark::utils::CheckNEON);
320   }
f32_dwconv_8x25__neon(benchmark::State & state,const char * net)321   static void f32_dwconv_8x25__neon(benchmark::State& state, const char* net) {
322     DWConvBenchmark(state,
323       xnn_f32_dwconv_minmax_ukernel_up8x25__neon,
324       xnn_init_f32_minmax_scalar_params,
325       8 /* channel tile */, 25 /* primary tile */,
326       benchmark::utils::CheckNEON);
327   }
f32_dwconv_8x25__neonfma_acc2(benchmark::State & state,const char * net)328   static void f32_dwconv_8x25__neonfma_acc2(benchmark::State& state, const char* net) {
329     DWConvBenchmark(state,
330       xnn_f32_dwconv_minmax_ukernel_up8x25__neonfma_acc2,
331       xnn_init_f32_minmax_scalar_params,
332       8 /* channel tile */, 25 /* primary tile */,
333       benchmark::utils::CheckNEONFMA);
334   }
f32_dwconv_8x25__neonfma(benchmark::State & state,const char * net)335   static void f32_dwconv_8x25__neonfma(benchmark::State& state, const char* net) {
336     DWConvBenchmark(state,
337       xnn_f32_dwconv_minmax_ukernel_up8x25__neonfma,
338       xnn_init_f32_minmax_scalar_params,
339       8 /* channel tile */, 25 /* primary tile */,
340       benchmark::utils::CheckNEONFMA);
341   }
f32_dwconv_16x4__neon_acc2(benchmark::State & state,const char * net)342   static void f32_dwconv_16x4__neon_acc2(benchmark::State& state, const char* net) {
343     DWConvBenchmark(state,
344       xnn_f32_dwconv_minmax_ukernel_up16x4__neon_acc2,
345       xnn_init_f32_minmax_scalar_params,
346       16 /* channel tile */, 4 /* primary tile */,
347       benchmark::utils::CheckNEON);
348   }
f32_dwconv_16x4__neon(benchmark::State & state,const char * net)349   static void f32_dwconv_16x4__neon(benchmark::State& state, const char* net) {
350     DWConvBenchmark(state,
351       xnn_f32_dwconv_minmax_ukernel_up16x4__neon,
352       xnn_init_f32_minmax_scalar_params,
353       16 /* channel tile */, 4 /* primary tile */,
354       benchmark::utils::CheckNEON);
355   }
f32_dwconv_16x4__neonfma_acc2(benchmark::State & state,const char * net)356   static void f32_dwconv_16x4__neonfma_acc2(benchmark::State& state, const char* net) {
357     DWConvBenchmark(state,
358       xnn_f32_dwconv_minmax_ukernel_up16x4__neonfma_acc2,
359       xnn_init_f32_minmax_scalar_params,
360       16 /* channel tile */, 4 /* primary tile */,
361       benchmark::utils::CheckNEONFMA);
362   }
f32_dwconv_16x4__neonfma(benchmark::State & state,const char * net)363   static void f32_dwconv_16x4__neonfma(benchmark::State& state, const char* net) {
364     DWConvBenchmark(state,
365       xnn_f32_dwconv_minmax_ukernel_up16x4__neonfma,
366       xnn_init_f32_minmax_scalar_params,
367       16 /* channel tile */, 4 /* primary tile */,
368       benchmark::utils::CheckNEONFMA);
369   }
f32_dwconv_16x9__neon_acc2(benchmark::State & state,const char * net)370   static void f32_dwconv_16x9__neon_acc2(benchmark::State& state, const char* net) {
371     DWConvBenchmark(state,
372       xnn_f32_dwconv_minmax_ukernel_up16x9__neon_acc2,
373       xnn_init_f32_minmax_scalar_params,
374       16 /* channel tile */, 9 /* primary tile */,
375       benchmark::utils::CheckNEON);
376   }
f32_dwconv_16x9__neon(benchmark::State & state,const char * net)377   static void f32_dwconv_16x9__neon(benchmark::State& state, const char* net) {
378     DWConvBenchmark(state,
379       xnn_f32_dwconv_minmax_ukernel_up16x9__neon,
380       xnn_init_f32_minmax_scalar_params,
381       16 /* channel tile */, 9 /* primary tile */,
382       benchmark::utils::CheckNEON);
383   }
f32_dwconv_16x9__neonfma_acc2(benchmark::State & state,const char * net)384   static void f32_dwconv_16x9__neonfma_acc2(benchmark::State& state, const char* net) {
385     DWConvBenchmark(state,
386       xnn_f32_dwconv_minmax_ukernel_up16x9__neonfma_acc2,
387       xnn_init_f32_minmax_scalar_params,
388       16 /* channel tile */, 9 /* primary tile */,
389       benchmark::utils::CheckNEONFMA);
390   }
f32_dwconv_16x9__neonfma(benchmark::State & state,const char * net)391   static void f32_dwconv_16x9__neonfma(benchmark::State& state, const char* net) {
392     DWConvBenchmark(state,
393       xnn_f32_dwconv_minmax_ukernel_up16x9__neonfma,
394       xnn_init_f32_minmax_scalar_params,
395       16 /* channel tile */, 9 /* primary tile */,
396       benchmark::utils::CheckNEONFMA);
397   }
f32_dwconv_16x25__neon_acc2(benchmark::State & state,const char * net)398   static void f32_dwconv_16x25__neon_acc2(benchmark::State& state, const char* net) {
399     DWConvBenchmark(state,
400       xnn_f32_dwconv_minmax_ukernel_up16x25__neon_acc2,
401       xnn_init_f32_minmax_scalar_params,
402       16 /* channel tile */, 25 /* primary tile */,
403       benchmark::utils::CheckNEON);
404   }
f32_dwconv_16x25__neon(benchmark::State & state,const char * net)405   static void f32_dwconv_16x25__neon(benchmark::State& state, const char* net) {
406     DWConvBenchmark(state,
407       xnn_f32_dwconv_minmax_ukernel_up16x25__neon,
408       xnn_init_f32_minmax_scalar_params,
409       16 /* channel tile */, 25 /* primary tile */,
410       benchmark::utils::CheckNEON);
411   }
f32_dwconv_16x25__neonfma_acc2(benchmark::State & state,const char * net)412   static void f32_dwconv_16x25__neonfma_acc2(benchmark::State& state, const char* net) {
413     DWConvBenchmark(state,
414       xnn_f32_dwconv_minmax_ukernel_up16x25__neonfma_acc2,
415       xnn_init_f32_minmax_scalar_params,
416       16 /* channel tile */, 25 /* primary tile */,
417       benchmark::utils::CheckNEONFMA);
418   }
f32_dwconv_16x25__neonfma(benchmark::State & state,const char * net)419   static void f32_dwconv_16x25__neonfma(benchmark::State& state, const char* net) {
420     DWConvBenchmark(state,
421       xnn_f32_dwconv_minmax_ukernel_up16x25__neonfma,
422       xnn_init_f32_minmax_scalar_params,
423       16 /* channel tile */, 25 /* primary tile */,
424       benchmark::utils::CheckNEONFMA);
425   }
426   BENCHMARK_DWCONV(f32_dwconv_4x4__neonfma)
BENCHMARK_DWCONV(f32_dwconv_4x4__neonfma_acc2)427   BENCHMARK_DWCONV(f32_dwconv_4x4__neonfma_acc2)
428   BENCHMARK_DWCONV(f32_dwconv_8x4__neonfma)
429   BENCHMARK_DWCONV(f32_dwconv_8x4__neonfma_acc2)
430   BENCHMARK_DWCONV(f32_dwconv_16x4__neonfma)
431   BENCHMARK_DWCONV(f32_dwconv_16x4__neonfma_acc2)
432 
433   BENCHMARK_DWCONV(f32_dwconv_4x9__neonfma)
434   BENCHMARK_DWCONV(f32_dwconv_4x9__neonfma_acc2)
435   BENCHMARK_DWCONV(f32_dwconv_8x9__neonfma)
436   BENCHMARK_DWCONV(f32_dwconv_8x9__neonfma_acc2)
437   BENCHMARK_DWCONV(f32_dwconv_16x9__neonfma)
438   BENCHMARK_DWCONV(f32_dwconv_16x9__neonfma_acc2)
439 
440   BENCHMARK_DWCONV(f32_dwconv_4x25__neonfma)
441   BENCHMARK_DWCONV(f32_dwconv_4x25__neonfma_acc2)
442   BENCHMARK_DWCONV(f32_dwconv_8x25__neonfma)
443   BENCHMARK_DWCONV(f32_dwconv_8x25__neonfma_acc2)
444   BENCHMARK_DWCONV(f32_dwconv_16x25__neonfma)
445   BENCHMARK_DWCONV(f32_dwconv_16x25__neonfma_acc2)
446 
447   BENCHMARK_DWCONV(f32_dwconv_4x4__neon)
448   BENCHMARK_DWCONV(f32_dwconv_4x4__neon_acc2)
449   BENCHMARK_DWCONV(f32_dwconv_8x4__neon)
450   BENCHMARK_DWCONV(f32_dwconv_8x4__neon_acc2)
451   BENCHMARK_DWCONV(f32_dwconv_16x4__neon)
452   BENCHMARK_DWCONV(f32_dwconv_16x4__neon_acc2)
453 
454   BENCHMARK_DWCONV(f32_dwconv_4x9__neon)
455   BENCHMARK_DWCONV(f32_dwconv_4x9__neon_acc2)
456   BENCHMARK_DWCONV(f32_dwconv_8x9__neon)
457   BENCHMARK_DWCONV(f32_dwconv_8x9__neon_acc2)
458   BENCHMARK_DWCONV(f32_dwconv_16x9__neon)
459   BENCHMARK_DWCONV(f32_dwconv_16x9__neon_acc2)
460 
461   BENCHMARK_DWCONV(f32_dwconv_4x25__neon)
462   BENCHMARK_DWCONV(f32_dwconv_4x25__neon_acc2)
463   BENCHMARK_DWCONV(f32_dwconv_8x25__neon)
464   BENCHMARK_DWCONV(f32_dwconv_8x25__neon_acc2)
465   BENCHMARK_DWCONV(f32_dwconv_16x25__neon)
466   BENCHMARK_DWCONV(f32_dwconv_16x25__neon_acc2)
467 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
468 
469 
470 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
471   static void f32_dwconv_4x4__sse(benchmark::State& state, const char* net) {
472     DWConvBenchmark(state,
473       xnn_f32_dwconv_minmax_ukernel_up4x4__sse,
474       xnn_init_f32_minmax_sse_params,
475       4 /* channel tile */, 4 /* primary tile */);
476   }
f32_dwconv_4x9__sse(benchmark::State & state,const char * net)477   static void f32_dwconv_4x9__sse(benchmark::State& state, const char* net) {
478     DWConvBenchmark(state,
479       xnn_f32_dwconv_minmax_ukernel_up4x9__sse,
480       xnn_init_f32_minmax_sse_params,
481       4 /* channel tile */, 9 /* primary tile */);
482   }
f32_dwconv_4x25__sse(benchmark::State & state,const char * net)483   static void f32_dwconv_4x25__sse(benchmark::State& state, const char* net) {
484     DWConvBenchmark(state,
485       xnn_f32_dwconv_minmax_ukernel_up4x25__sse,
486       xnn_init_f32_minmax_sse_params,
487       4 /* channel tile */, 25 /* primary tile */);
488   }
489 
490   BENCHMARK_DWCONV(f32_dwconv_4x4__sse)
BENCHMARK_DWCONV(f32_dwconv_4x9__sse)491   BENCHMARK_DWCONV(f32_dwconv_4x9__sse)
492   BENCHMARK_DWCONV(f32_dwconv_4x25__sse)
493 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
494 
495 
496 static void f32_dwconv_1x4__scalar(benchmark::State& state, const char* net) {
497   DWConvBenchmark(state,
498     xnn_f32_dwconv_minmax_ukernel_up1x4__scalar,
499     xnn_init_f32_minmax_scalar_params,
500     1 /* channel tile */, 4 /* primary tile */);
501 }
f32_dwconv_1x4__scalar_acc2(benchmark::State & state,const char * net)502 static void f32_dwconv_1x4__scalar_acc2(benchmark::State& state, const char* net) {
503   DWConvBenchmark(state,
504     xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2,
505     xnn_init_f32_minmax_scalar_params,
506     1 /* channel tile */, 4 /* primary tile */);
507 }
f32_dwconv_2x4__scalar(benchmark::State & state,const char * net)508 static void f32_dwconv_2x4__scalar(benchmark::State& state, const char* net) {
509   DWConvBenchmark(state,
510     xnn_f32_dwconv_minmax_ukernel_up2x4__scalar,
511     xnn_init_f32_minmax_scalar_params,
512     2 /* channel tile */, 4 /* primary tile */);
513 }
f32_dwconv_2x4__scalar_acc2(benchmark::State & state,const char * net)514 static void f32_dwconv_2x4__scalar_acc2(benchmark::State& state, const char* net) {
515   DWConvBenchmark(state,
516     xnn_f32_dwconv_minmax_ukernel_up2x4__scalar_acc2,
517     xnn_init_f32_minmax_scalar_params,
518     2 /* channel tile */, 4 /* primary tile */);
519 }
f32_dwconv_1x9__scalar(benchmark::State & state,const char * net)520 static void f32_dwconv_1x9__scalar(benchmark::State& state, const char* net) {
521   DWConvBenchmark(state,
522     xnn_f32_dwconv_minmax_ukernel_up1x9__scalar,
523     xnn_init_f32_minmax_scalar_params,
524     1 /* channel tile */, 9 /* primary tile */);
525 }
f32_dwconv_1x9__scalar_acc2(benchmark::State & state,const char * net)526 static void f32_dwconv_1x9__scalar_acc2(benchmark::State& state, const char* net) {
527   DWConvBenchmark(state,
528     xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2,
529     xnn_init_f32_minmax_scalar_params,
530     1 /* channel tile */, 9 /* primary tile */);
531 }
f32_dwconv_2x9__scalar(benchmark::State & state,const char * net)532 static void f32_dwconv_2x9__scalar(benchmark::State& state, const char* net) {
533   DWConvBenchmark(state,
534     xnn_f32_dwconv_minmax_ukernel_up2x9__scalar,
535     xnn_init_f32_minmax_scalar_params,
536     2 /* channel tile */, 9 /* primary tile */);
537 }
f32_dwconv_2x9__scalar_acc2(benchmark::State & state,const char * net)538 static void f32_dwconv_2x9__scalar_acc2(benchmark::State& state, const char* net) {
539   DWConvBenchmark(state,
540     xnn_f32_dwconv_minmax_ukernel_up2x9__scalar_acc2,
541     xnn_init_f32_minmax_scalar_params,
542     2 /* channel tile */, 9 /* primary tile */);
543 }
f32_dwconv_1x25__scalar(benchmark::State & state,const char * net)544 static void f32_dwconv_1x25__scalar(benchmark::State& state, const char* net) {
545   DWConvBenchmark(state,
546     xnn_f32_dwconv_minmax_ukernel_up1x25__scalar,
547     xnn_init_f32_minmax_scalar_params,
548     1 /* channel tile */, 25 /* primary tile */);
549 }
f32_dwconv_1x25__scalar_acc2(benchmark::State & state,const char * net)550 static void f32_dwconv_1x25__scalar_acc2(benchmark::State& state, const char* net) {
551   DWConvBenchmark(state,
552     xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2,
553     xnn_init_f32_minmax_scalar_params,
554     1 /* channel tile */, 25 /* primary tile */);
555 }
f32_dwconv_2x25__scalar(benchmark::State & state,const char * net)556 static void f32_dwconv_2x25__scalar(benchmark::State& state, const char* net) {
557   DWConvBenchmark(state,
558     xnn_f32_dwconv_minmax_ukernel_up1x25__scalar,
559     xnn_init_f32_minmax_scalar_params,
560     2 /* channel tile */, 25 /* primary tile */);
561 }
f32_dwconv_2x25__scalar_acc2(benchmark::State & state,const char * net)562 static void f32_dwconv_2x25__scalar_acc2(benchmark::State& state, const char* net) {
563   DWConvBenchmark(state,
564     xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2,
565     xnn_init_f32_minmax_scalar_params,
566     2 /* channel tile */, 25 /* primary tile */);
567 }
568 
569 BENCHMARK_DWCONV(f32_dwconv_1x4__scalar)
570 BENCHMARK_DWCONV(f32_dwconv_1x4__scalar_acc2)
571 BENCHMARK_DWCONV(f32_dwconv_2x4__scalar)
572 BENCHMARK_DWCONV(f32_dwconv_2x4__scalar_acc2)
573 
574 BENCHMARK_DWCONV(f32_dwconv_1x9__scalar)
575 BENCHMARK_DWCONV(f32_dwconv_1x9__scalar_acc2)
576 BENCHMARK_DWCONV(f32_dwconv_2x9__scalar)
577 BENCHMARK_DWCONV(f32_dwconv_2x9__scalar_acc2)
578 
579 BENCHMARK_DWCONV(f32_dwconv_1x25__scalar)
580 BENCHMARK_DWCONV(f32_dwconv_1x25__scalar_acc2)
581 BENCHMARK_DWCONV(f32_dwconv_2x25__scalar)
582 BENCHMARK_DWCONV(f32_dwconv_2x25__scalar_acc2)
583 
584 
585 #ifndef XNNPACK_BENCHMARK_NO_MAIN
586 BENCHMARK_MAIN();
587 #endif
588