• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <algorithm>
10 #include <cfloat>
11 #include <cmath>
12 #include <functional>
13 #include <limits>
14 #include <ostream>
15 #include <random>
16 #include <string>
17 #include <vector>
18 
19 #include <xnnpack.h>
20 
21 #ifdef BENCHMARK_ARM_COMPUTE_LIBRARY
22 #include "arm_compute/core/Types.h"
23 #include "arm_compute/runtime/Tensor.h"
24 #include "arm_compute/runtime/CPP/CPPScheduler.h"
25 #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
26 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
27 #endif  // BENCHMARK_ARM_COMPUTE_LIBRARY
28 #include <benchmark/benchmark.h>
29 #include <fp16.h>
30 #ifdef BENCHMARK_TENSORFLOW_LITE
31 #include "flatbuffers/include/flatbuffers/flatbuffers.h"
32 #include "tensorflow/lite/interpreter.h"
33 #include "tensorflow/lite/kernels/register.h"
34 #include "tensorflow/lite/model.h"
35 #include "tensorflow/lite/schema/schema_generated.h"
36 #include "tensorflow/lite/version.h"
37 #endif  // BENCHMARK_TENSORFLOW_LITE
38 #include "bench/utils.h"
39 
40 #ifndef XNN_NO_QU8_OPERATORS
xnnpack_convolution_qu8(benchmark::State & state,const char * net)41 void xnnpack_convolution_qu8(benchmark::State& state, const char* net) {
42   const size_t batch_size = state.range(0);
43   const size_t input_height = state.range(1);
44   const size_t input_width = state.range(2);
45   const size_t kernel_height = state.range(3);
46   const size_t kernel_width = state.range(4);
47   const size_t padding_height = state.range(5);
48   const size_t padding_width = state.range(6);
49   const size_t subsampling = state.range(7);
50   const size_t dilation = state.range(8);
51   const size_t groups = state.range(9);
52   const size_t group_input_channels = state.range(10);
53   const size_t group_output_channels = state.range(11);
54 
55   std::random_device random_device;
56   auto rng = std::mt19937(random_device());
57   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
58   auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
59 
60   const size_t output_pixel_stride = groups * group_output_channels;
61   const size_t input_pixel_stride = groups * group_input_channels;
62   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
63   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
64   const size_t padding_left = padding_width / 2;
65   const size_t padding_top = padding_height / 2;
66   const size_t padding_right = padding_width - padding_left;
67   const size_t padding_bottom = padding_height - padding_top;
68   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
69   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
70 
71   std::vector<uint8_t> input(batch_size * input_height * input_width * input_pixel_stride);
72   std::generate(input.begin(), input.end(), std::ref(u8rng));
73   std::vector<uint8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
74   std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
75   std::vector<int32_t> bias(groups * group_output_channels);
76   std::generate(bias.begin(), bias.end(), std::ref(i32rng));
77   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
78 
79   xnn_status status = xnn_initialize(nullptr /* allocator */);
80   if (status != xnn_status_success) {
81     state.SkipWithError("failed to initialize XNNPACK");
82     return;
83   }
84 
85   const size_t num_buffers = 1 +
86     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
87       sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements);
88   std::vector<uint8_t> output(output_elements * num_buffers);
89 
90   std::vector<xnn_operator_t> convolution_operators(num_buffers);
91   for (xnn_operator_t& convolution_op : convolution_operators) {
92     status = xnn_create_convolution2d_nhwc_qu8(
93       padding_top, padding_right, padding_bottom, padding_left,
94       kernel_height, kernel_width,
95       subsampling, subsampling,
96       dilation, dilation,
97       groups, group_input_channels, group_output_channels,
98       input_pixel_stride, output_pixel_stride,
99       127, 0.5f,
100       127, 0.5f,
101       kernel.data(), bias.data(),
102       127, 0.5f, 0, 255,
103       0 /* flags */, &convolution_op);
104     if (status != xnn_status_success) {
105       state.SkipWithError("failed to create QUINT8 Convolution operator");
106       return;
107     }
108   }
109 
110   for (size_t i = 0; i < convolution_operators.size(); i++) {
111     status = xnn_setup_convolution2d_nhwc_qu8(
112       convolution_operators[i],
113       batch_size, input_height, input_width,
114       input.data(), output.data() + i * output_elements,
115       nullptr /* thread pool */);
116     if (status != xnn_status_success) {
117       state.SkipWithError("failed to setup QUINT8 Convolution operator");
118       return;
119     }
120   }
121 
122   size_t buffer_index = 0;
123   for (auto _ : state) {
124     state.PauseTiming();
125     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
126     buffer_index = (buffer_index + 1) % num_buffers;
127     state.ResumeTiming();
128 
129     status = xnn_run_operator(convolution_operators[buffer_index],
130       nullptr /* thread pool */);
131     if (status != xnn_status_success) {
132       state.SkipWithError("failed to run QUINT8 Convolution operator");
133       return;
134     }
135   }
136 
137   for (xnn_operator_t& convolution_op : convolution_operators) {
138     status = xnn_delete_operator(convolution_op);
139     if (status != xnn_status_success) {
140       state.SkipWithError("failed to delete QUINT8 Convolution operator");
141       return;
142     }
143     convolution_op = nullptr;
144   }
145 
146   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
147   if (cpu_frequency != 0) {
148     state.counters["cpufreq"] = cpu_frequency;
149   }
150 
151   state.counters["OPS"] = benchmark::Counter(
152     uint64_t(state.iterations()) * 2 *
153       batch_size * output_height * output_width *
154       groups * group_input_channels * group_output_channels *
155       kernel_height * kernel_width,
156     benchmark::Counter::kIsRate);
157 }
158 #endif  // XNN_NO_QU8_OPERATORS
159 
160 #ifndef XNN_NO_QS8_OPERATORS
xnnpack_convolution_qs8(benchmark::State & state,const char * net)161 void xnnpack_convolution_qs8(benchmark::State& state, const char* net) {
162   const size_t batch_size = state.range(0);
163   const size_t input_height = state.range(1);
164   const size_t input_width = state.range(2);
165   const size_t kernel_height = state.range(3);
166   const size_t kernel_width = state.range(4);
167   const size_t padding_height = state.range(5);
168   const size_t padding_width = state.range(6);
169   const size_t subsampling = state.range(7);
170   const size_t dilation = state.range(8);
171   const size_t groups = state.range(9);
172   const size_t group_input_channels = state.range(10);
173   const size_t group_output_channels = state.range(11);
174 
175   std::random_device random_device;
176   auto rng = std::mt19937(random_device());
177   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
178   auto i8rng = std::bind(
179     std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));
180 
181   const size_t output_pixel_stride = groups * group_output_channels;
182   const size_t input_pixel_stride = groups * group_input_channels;
183   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
184   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
185   const size_t padding_left = padding_width / 2;
186   const size_t padding_top = padding_height / 2;
187   const size_t padding_right = padding_width - padding_left;
188   const size_t padding_bottom = padding_height - padding_top;
189   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
190   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
191 
192   std::vector<int8_t> input(batch_size * input_height * input_width * input_pixel_stride);
193   std::generate(input.begin(), input.end(), std::ref(i8rng));
194   std::vector<int8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
195   std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
196   std::vector<int32_t> bias(groups * group_output_channels);
197   std::generate(bias.begin(), bias.end(), std::ref(i32rng));
198   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
199 
200   xnn_status status = xnn_initialize(nullptr /* allocator */);
201   if (status != xnn_status_success) {
202     state.SkipWithError("failed to initialize XNNPACK");
203     return;
204   }
205 
206   const size_t num_buffers = 1 +
207     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
208       sizeof(int8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(int8_t) * output_elements);
209   std::vector<int8_t> output(output_elements * num_buffers);
210 
211   std::vector<xnn_operator_t> convolution_operators(num_buffers);
212   for (xnn_operator_t& convolution_op : convolution_operators) {
213     status = xnn_create_convolution2d_nhwc_qs8(
214       padding_top, padding_right, padding_bottom, padding_left,
215       kernel_height, kernel_width,
216       subsampling, subsampling,
217       dilation, dilation,
218       groups, group_input_channels, group_output_channels,
219       input_pixel_stride, output_pixel_stride,
220       127, 0.5f, 0.5f,
221       kernel.data(), bias.data(),
222       127, 0.5f, -128, 127,
223       0 /* flags */, &convolution_op);
224     if (status != xnn_status_success) {
225       state.SkipWithError("failed to create QINT8 Convolution operator");
226       return;
227     }
228   }
229 
230   for (size_t i = 0; i < convolution_operators.size(); i++) {
231     status = xnn_setup_convolution2d_nhwc_qs8(
232       convolution_operators[i],
233       batch_size, input_height, input_width,
234       input.data(), output.data() + i * output_elements,
235       nullptr /* thread pool */);
236     if (status != xnn_status_success) {
237       state.SkipWithError("failed to setup QINT8 Convolution operator");
238       return;
239     }
240   }
241 
242   size_t buffer_index = 0;
243   for (auto _ : state) {
244     state.PauseTiming();
245     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
246     buffer_index = (buffer_index + 1) % num_buffers;
247     state.ResumeTiming();
248 
249     status = xnn_run_operator(convolution_operators[buffer_index],
250       nullptr /* thread pool */);
251     if (status != xnn_status_success) {
252       state.SkipWithError("failed to run QINT8 Convolution operator");
253       return;
254     }
255   }
256 
257   for (xnn_operator_t& convolution_op : convolution_operators) {
258     status = xnn_delete_operator(convolution_op);
259     if (status != xnn_status_success) {
260       state.SkipWithError("failed to delete QINT8 Convolution operator");
261       return;
262     }
263     convolution_op = nullptr;
264   }
265 
266   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
267   if (cpu_frequency != 0) {
268     state.counters["cpufreq"] = cpu_frequency;
269   }
270 
271   state.counters["OPS"] = benchmark::Counter(
272     uint64_t(state.iterations()) * 2 *
273       batch_size * output_height * output_width *
274       groups * group_input_channels * group_output_channels *
275       kernel_height * kernel_width,
276     benchmark::Counter::kIsRate);
277 }
278 #endif  // XNN_NO_QS8_OPERATORS
279 
280 #ifndef XNN_NO_F16_OPERATORS
xnnpack_convolution_f16(benchmark::State & state,const char * net)281 void xnnpack_convolution_f16(benchmark::State& state, const char* net) {
282   if (!benchmark::utils::CheckNEONFP16ARITH(state)) {
283     return;
284   }
285   const size_t batch_size = state.range(0);
286   const size_t input_height = state.range(1);
287   const size_t input_width = state.range(2);
288   const size_t kernel_height = state.range(3);
289   const size_t kernel_width = state.range(4);
290   const size_t padding_height = state.range(5);
291   const size_t padding_width = state.range(6);
292   const size_t subsampling = state.range(7);
293   const size_t dilation = state.range(8);
294   const size_t groups = state.range(9);
295   const size_t group_input_channels = state.range(10);
296   const size_t group_output_channels = state.range(11);
297 
298   std::random_device random_device;
299   auto rng = std::mt19937(random_device());
300   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), std::ref(rng));
301   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
302 
303   const size_t output_pixel_stride = groups * group_output_channels;
304   const size_t input_pixel_stride = groups * group_input_channels;
305   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
306   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
307   const size_t padding_left = padding_width / 2;
308   const size_t padding_top = padding_height / 2;
309   const size_t padding_right = padding_width - padding_left;
310   const size_t padding_bottom = padding_height - padding_top;
311   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
312   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
313 
314   std::vector<uint16_t> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
315   std::generate(input.begin(), input.end(), std::ref(f16rng));
316   std::vector<uint16_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
317   std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
318   std::vector<uint16_t> bias(groups * group_output_channels);
319   std::generate(bias.begin(), bias.end(), std::ref(f16rng));
320   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
321 
322   xnn_status status = xnn_initialize(nullptr /* allocator */);
323   if (status != xnn_status_success) {
324     state.SkipWithError("failed to initialize XNNPACK");
325     return;
326   }
327 
328   const size_t num_buffers = 1 +
329     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
330       sizeof(uint16_t) * (kernel.size() + bias.size() + output_elements));
331   std::vector<uint16_t> output(output_elements * num_buffers);
332 
333   std::vector<xnn_operator_t> convolution_operators(num_buffers);
334   for (xnn_operator_t& convolution_op : convolution_operators) {
335     status = xnn_create_convolution2d_nhwc_f16(
336       padding_top, padding_right, padding_bottom, padding_left,
337       kernel_height, kernel_width,
338       subsampling, subsampling,
339       dilation, dilation,
340       groups, group_input_channels, group_output_channels,
341       input_pixel_stride, output_pixel_stride,
342       kernel.data(), bias.data(),
343       -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
344       0 /* flags */, &convolution_op);
345     if (status != xnn_status_success) {
346       state.SkipWithError("failed to create FP16 Convolution operator");
347       return;
348     }
349   }
350 
351   for (size_t i = 0; i < convolution_operators.size(); i++) {
352     status = xnn_setup_convolution2d_nhwc_f16(
353       convolution_operators[i],
354       batch_size, input_height, input_width,
355       input.data(), output.data() + i * output_elements,
356       nullptr /* thread pool */);
357     if (status != xnn_status_success) {
358       state.SkipWithError("failed to setup FP16 Convolution operator");
359       return;
360     }
361   }
362 
363   size_t buffer_index = 0;
364   for (auto _ : state) {
365     state.PauseTiming();
366     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
367     buffer_index = (buffer_index + 1) % num_buffers;
368     state.ResumeTiming();
369 
370     status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
371     if (status != xnn_status_success) {
372       state.SkipWithError("failed to run FP16 Convolution operator");
373       return;
374     }
375   }
376 
377   for (xnn_operator_t& convolution_op : convolution_operators) {
378     status = xnn_delete_operator(convolution_op);
379     if (status != xnn_status_success) {
380       state.SkipWithError("failed to delete FP16 Convolution operator");
381       return;
382     }
383     convolution_op = nullptr;
384   }
385 
386   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
387   if (cpu_frequency != 0) {
388     state.counters["cpufreq"] = cpu_frequency;
389   }
390 
391   state.counters["FLOPS"] = benchmark::Counter(
392     uint64_t(state.iterations()) * 2 *
393       batch_size * output_height * output_width *
394       groups * group_input_channels * group_output_channels *
395       kernel_height * kernel_width,
396     benchmark::Counter::kIsRate);
397 }
398 #endif  // XNN_NO_F16_OPERATORS
399 
xnnpack_convolution_f32(benchmark::State & state,const char * net)400 void xnnpack_convolution_f32(benchmark::State& state, const char* net) {
401   const size_t batch_size = state.range(0);
402   const size_t input_height = state.range(1);
403   const size_t input_width = state.range(2);
404   const size_t kernel_height = state.range(3);
405   const size_t kernel_width = state.range(4);
406   const size_t padding_height = state.range(5);
407   const size_t padding_width = state.range(6);
408   const size_t subsampling = state.range(7);
409   const size_t dilation = state.range(8);
410   const size_t groups = state.range(9);
411   const size_t group_input_channels = state.range(10);
412   const size_t group_output_channels = state.range(11);
413 
414   std::random_device random_device;
415   auto rng = std::mt19937(random_device());
416   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
417 
418   const size_t output_pixel_stride = groups * group_output_channels;
419   const size_t input_pixel_stride = groups * group_input_channels;
420   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
421   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
422   const size_t padding_left = padding_width / 2;
423   const size_t padding_top = padding_height / 2;
424   const size_t padding_right = padding_width - padding_left;
425   const size_t padding_bottom = padding_height - padding_top;
426   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
427   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
428 
429   std::vector<float> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
430   std::generate(input.begin(), input.end(), std::ref(f32rng));
431   std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
432   std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
433   std::vector<float> bias(groups * group_output_channels);
434   std::generate(bias.begin(), bias.end(), std::ref(f32rng));
435   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
436 
437   xnn_status status = xnn_initialize(nullptr /* allocator */);
438   if (status != xnn_status_success) {
439     state.SkipWithError("failed to initialize XNNPACK");
440     return;
441   }
442 
443   const size_t num_buffers = 1 +
444     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
445       sizeof(float) * (kernel.size() + bias.size() + output_elements));
446   std::vector<float> output(output_elements * num_buffers);
447 
448   std::vector<xnn_operator_t> convolution_operators(num_buffers);
449   for (xnn_operator_t& convolution_op : convolution_operators) {
450     status = xnn_create_convolution2d_nhwc_f32(
451       padding_top, padding_right, padding_bottom, padding_left,
452       kernel_height, kernel_width,
453       subsampling, subsampling,
454       dilation, dilation,
455       groups, group_input_channels, group_output_channels,
456       input_pixel_stride, output_pixel_stride,
457       kernel.data(), bias.data(),
458       -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
459       0 /* flags */, &convolution_op);
460     if (status != xnn_status_success) {
461       state.SkipWithError("failed to create FP32 Convolution operator");
462       return;
463     }
464   }
465 
466   for (size_t i = 0; i < convolution_operators.size(); i++) {
467     status = xnn_setup_convolution2d_nhwc_f32(
468       convolution_operators[i],
469       batch_size, input_height, input_width,
470       input.data(), output.data() + i * output_elements,
471       nullptr /* thread pool */);
472     if (status != xnn_status_success) {
473       state.SkipWithError("failed to setup FP32 Convolution operator");
474       return;
475     }
476   }
477 
478   size_t buffer_index = 0;
479   for (auto _ : state) {
480     state.PauseTiming();
481     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
482     buffer_index = (buffer_index + 1) % num_buffers;
483     state.ResumeTiming();
484 
485     status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
486     if (status != xnn_status_success) {
487       state.SkipWithError("failed to run FP32 Convolution operator");
488       return;
489     }
490   }
491 
492   for (xnn_operator_t& convolution_op : convolution_operators) {
493     status = xnn_delete_operator(convolution_op);
494     if (status != xnn_status_success) {
495       state.SkipWithError("failed to delete FP32 Convolution operator");
496       return;
497     }
498     convolution_op = nullptr;
499   }
500 
501   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
502   if (cpu_frequency != 0) {
503     state.counters["cpufreq"] = cpu_frequency;
504   }
505 
506   state.counters["FLOPS"] = benchmark::Counter(
507     uint64_t(state.iterations()) * 2 *
508       batch_size * output_height * output_width *
509       groups * group_input_channels * group_output_channels *
510       kernel_height * kernel_width,
511     benchmark::Counter::kIsRate);
512 }
513 
514 #ifdef BENCHMARK_TENSORFLOW_LITE
tflite_convolution_f32(benchmark::State & state,const char * net)515 void tflite_convolution_f32(benchmark::State& state, const char* net) {
516   const size_t batch_size = state.range(0);
517   const size_t input_height = state.range(1);
518   const size_t input_width = state.range(2);
519   const size_t kernel_height = state.range(3);
520   const size_t kernel_width = state.range(4);
521   const size_t padding_height = state.range(5);
522   const size_t padding_width = state.range(6);
523   const size_t subsampling = state.range(7);
524   const size_t dilation = state.range(8);
525   const size_t groups = state.range(9);
526   const size_t group_input_channels = state.range(10);
527   const size_t group_output_channels = state.range(11);
528 
529   bool is_depthwise = false;
530   if (groups != 1) {
531     if (group_input_channels == 1) {
532       is_depthwise = true;
533     } else {
534       state.SkipWithError("grouped convolution is not supported");
535       return;
536     }
537   }
538 
539   std::random_device random_device;
540   auto rng = std::mt19937(random_device());
541   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
542 
543   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
544   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
545 
546   tflite::Padding padding = tflite::Padding_VALID;
547   if (padding_width == (effective_kernel_width - 1) && padding_height == (effective_kernel_height - 1)) {
548     padding = tflite::Padding_SAME;
549   } else if (padding_width == 0 && padding_height == 0) {
550     padding = tflite::Padding_VALID;
551   } else {
552     state.SkipWithError("unsupported padding");
553     return;
554   }
555 
556   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
557   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
558 
559   std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
560   std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
561   std::vector<float> bias(groups * group_output_channels);
562   std::generate(bias.begin(), bias.end(), std::ref(f32rng));
563 
564   flatbuffers::FlatBufferBuilder builder;
565   flatbuffers::Offset<tflite::OperatorCode> operator_code =
566       CreateOperatorCode(
567         builder,
568         is_depthwise ? tflite::BuiltinOperator_DEPTHWISE_CONV_2D : tflite::BuiltinOperator_CONV_2D,
569         0);
570 
571   flatbuffers::Offset<tflite::Conv2DOptions> conv2d_options = CreateConv2DOptions(
572       builder,
573       padding,
574       static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
575       tflite::ActivationFunctionType_NONE,
576       static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));
577 
578   flatbuffers::Offset<tflite::DepthwiseConv2DOptions> dwconv2d_options = CreateDepthwiseConv2DOptions(
579       builder,
580       padding,
581       static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
582       static_cast<int32_t>(group_output_channels),
583       tflite::ActivationFunctionType_NONE,
584       static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));
585 
586   flatbuffers::Offset<tflite::Buffer> buffers[3] = {
587     tflite::CreateBuffer(builder, builder.CreateVector({})),
588     tflite::CreateBuffer(builder, builder.CreateVector(
589       reinterpret_cast<const uint8_t*>(kernel.data()),
590       sizeof(float) * kernel.size())),
591     tflite::CreateBuffer(builder, builder.CreateVector(
592       reinterpret_cast<const uint8_t*>(bias.data()),
593       sizeof(float) * bias.size())),
594   };
595 
596   const int32_t input_shape[4] = {
597     static_cast<int32_t>(batch_size),
598     static_cast<int32_t>(input_height),
599     static_cast<int32_t>(input_width),
600     static_cast<int32_t>(groups * group_input_channels)
601   };
602   const int32_t output_shape[4] = {
603     static_cast<int32_t>(batch_size),
604     static_cast<int32_t>(output_height),
605     static_cast<int32_t>(output_width),
606     static_cast<int32_t>(groups * group_output_channels)
607   };
608   const int32_t filter_shape[4] = {
609     static_cast<int32_t>(group_output_channels),
610     static_cast<int32_t>(kernel_height),
611     static_cast<int32_t>(kernel_width),
612     static_cast<int32_t>(groups * group_input_channels)
613   };
614   const int32_t bias_shape[1] = {
615     static_cast<int32_t>(groups * group_output_channels)
616   };
617 
618   flatbuffers::Offset<tflite::Tensor> tensors[4] = {
619     tflite::CreateTensor(builder,
620                          builder.CreateVector<int32_t>(input_shape, 4),
621                          tflite::TensorType_FLOAT32,
622                          0 /* buffer id */,
623                          builder.CreateString("input")),
624     tflite::CreateTensor(builder,
625                          builder.CreateVector<int32_t>(filter_shape, 4),
626                          tflite::TensorType_FLOAT32,
627                          1 /* buffer id */,
628                          builder.CreateString("filter")),
629     tflite::CreateTensor(builder,
630                          builder.CreateVector<int32_t>(bias_shape, 1),
631                          tflite::TensorType_FLOAT32,
632                          2 /* buffer id */,
633                          builder.CreateString("bias")),
634     tflite::CreateTensor(builder,
635                          builder.CreateVector<int32_t>(output_shape, 4),
636                          tflite::TensorType_FLOAT32,
637                          0 /* buffer id */,
638                          builder.CreateString("output")),
639   };
640 
641   const int32_t op_inputs[3] = { 0, 1, 2 };
642   const int32_t op_outputs[1] = { 3 };
643   flatbuffers::Offset<tflite::Operator> op = CreateOperator(
644       builder,
645       0 /* opcode_index */,
646       builder.CreateVector<int32_t>(op_inputs, 3),
647       builder.CreateVector<int32_t>(op_outputs, 1),
648       is_depthwise ? tflite::BuiltinOptions_DepthwiseConv2DOptions : tflite::BuiltinOptions_Conv2DOptions,
649       is_depthwise ? dwconv2d_options.Union() : conv2d_options.Union(),
650       /*custom_options */ 0,
651       tflite::CustomOptionsFormat_FLEXBUFFERS);
652 
653   const int32_t graph_inputs[1] = { 0 };
654   const int32_t graph_outputs[1] = { 3 };
655   flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
656       builder,
657       builder.CreateVector(tensors, 4),
658       builder.CreateVector<int32_t>(graph_inputs, 1),
659       builder.CreateVector<int32_t>(graph_outputs, 1),
660       builder.CreateVector(&op, 1),
661       builder.CreateString("Conv2D subgraph"));
662 
663   flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Conv2D model");
664 
665   flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
666       TFLITE_SCHEMA_VERSION,
667       builder.CreateVector(&operator_code, 1),
668       builder.CreateVector(&subgraph, 1),
669       description,
670       builder.CreateVector(buffers, 3));
671 
672   builder.Finish(model_buffer);
673 
674   const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
675   tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
676   tflite::InterpreterBuilder interpreterBuilder(model, resolver);
677   std::unique_ptr<tflite::Interpreter> interpreter;
678   if (interpreterBuilder(&interpreter) != kTfLiteOk) {
679     state.SkipWithError("failed to create TFLite interpreter");
680     return;
681   }
682   if (interpreter == nullptr) {
683     state.SkipWithError("TFLite interpreter is null");
684     return;
685   }
686   interpreter->SetNumThreads(1);
687 
688   if (interpreter->AllocateTensors() != kTfLiteOk) {
689     state.SkipWithError("failed to allocate tensors");
690     return;
691   }
692 
693   std::generate(
694     interpreter->typed_tensor<float>(0),
695     interpreter->typed_tensor<float>(0) + batch_size * groups * group_input_channels * input_height * input_width,
696     std::ref(f32rng));
697 
698   for (auto _ : state) {
699     state.PauseTiming();
700     benchmark::utils::WipeCache();
701     benchmark::utils::PrefetchToL1(
702       interpreter->typed_tensor<float>(0),
703       batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));
704     state.ResumeTiming();
705 
706     if (interpreter->Invoke() != kTfLiteOk) {
707       state.SkipWithError("failed to invoke TFLite interpreter");
708       return;
709     }
710   }
711 
712   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
713   if (cpu_frequency != 0) {
714     state.counters["cpufreq"] = cpu_frequency;
715   }
716 
717   state.counters["FLOPS"] = benchmark::Counter(
718     uint64_t(state.iterations()) * 2 *
719       batch_size * output_height * output_width *
720       groups * group_input_channels * group_output_channels *
721       kernel_height * kernel_width,
722     benchmark::Counter::kIsRate);
723 
724   interpreter.reset();
725 }
726 #endif  // BENCHMARK_TENSORFLOW_LITE
727 
728 #ifdef BENCHMARK_ARM_COMPUTE_LIBRARY
compare_with_convolution_f32_reference_output(const benchmark::State & state,const float * input,size_t input_size,const float * kernel,size_t kernel_size,const float * bias,size_t bias_size,const float * output,size_t output_size)729 static std::string compare_with_convolution_f32_reference_output(
730     const benchmark::State& state, const float* input, size_t input_size,
731     const float* kernel, size_t kernel_size, const float* bias, size_t bias_size,
732     const float* output, size_t output_size)
733 {
734   const size_t batch_size = state.range(0);
735   const size_t input_height = state.range(1);
736   const size_t input_width = state.range(2);
737   const size_t kernel_height = state.range(3);
738   const size_t kernel_width = state.range(4);
739   const size_t padding_height = state.range(5);
740   const size_t padding_width = state.range(6);
741   const size_t subsampling = state.range(7);
742   const size_t dilation = state.range(8);
743   const size_t groups = state.range(9);
744   const size_t group_input_channels = state.range(10);
745   const size_t group_output_channels = state.range(11);
746 
747   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
748   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
749   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
750   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
751   const size_t input_pixel_stride = groups * group_input_channels;
752   const size_t padding_left = padding_width / 2;
753   const size_t padding_top = padding_height / 2;
754 
755   assert(input_size == batch_size * input_height * input_width * groups * group_input_channels);
756 
757   assert(kernel_size == group_output_channels * kernel_height * kernel_width * groups * group_input_channels);
758 
759   assert(bias_size == groups * group_output_channels);
760 
761   assert(output_size == batch_size * output_height * output_width * groups * group_output_channels);
762 
763   std::vector<float> output_ref(output_size);
764   for (size_t i = 0; i < batch_size; i++) {
765     for (size_t oy = 0; oy < output_height; oy++) {
766       for (size_t ox = 0; ox < output_width; ox++) {
767         for (size_t g = 0; g < groups; g++) {
768           for (size_t oc = 0; oc < group_output_channels; oc++) {
769             output_ref[(((i * output_height + oy) * output_width + ox) * groups + g) * group_output_channels + oc] =
770               bias[g * group_output_channels + oc];
771           }
772         }
773       }
774     }
775   }
776   for (size_t i = 0; i < batch_size; i++) {
777     for (size_t oy = 0; oy < output_height; oy++) {
778       for (size_t ox = 0; ox < output_width; ox++) {
779         for (size_t ky = 0; ky < kernel_height; ky++) {
780           const size_t iy = oy * subsampling + ky * dilation - padding_top;
781           if (iy < input_height) {
782             for (size_t kx = 0; kx < kernel_width; kx++) {
783               const size_t ix = ox * subsampling + kx * dilation - padding_left;
784               if (ix < input_width) {
785                 for (size_t g = 0; g < groups; g++) {
786                   for (size_t oc = 0; oc < group_output_channels; oc++) {
787                     for (size_t ic = 0; ic < group_input_channels; ic++) {
788                       output_ref[(((i * output_height + oy) * output_width + ox) * groups + g) * group_output_channels + oc] +=
789                         input[((i * input_height + iy) * input_width + ix) * input_pixel_stride + g * group_input_channels + ic] *
790                         kernel[(((oc * kernel_height + ky) * kernel_width + kx) * groups + g) * group_input_channels + ic];
791                     }  // group_input_channels loop
792                   }  // group_output_channels loop
793                 }  // groups loop
794               }
795             }  // kernel_width loop
796           }
797         }  // kernel_height loop
798       }  // output_width loop
799     }  // output_height loop
800   }  // batch_size loop
801 
802   const float relative_error_tolerance = 1e-4;
803   for (size_t i = 0; i < batch_size; i++) {
804     for (size_t y = 0; y < output_height; y++) {
805       for (size_t x = 0; x < output_width; x++) {
806         for (size_t g = 0; g < groups; g++) {
807           for (size_t c = 0; c < group_output_channels; c++) {
808             const size_t idx = (((i * output_height + y) * output_width + x) * groups + g) * group_output_channels + c;
809             const float value_ref = output_ref[idx];
810             const float value = output[idx];
811             if (std::abs(value - value_ref) > std::max(std::abs(value_ref) * relative_error_tolerance, std::numeric_limits<float>::epsilon())) {
812               std::ostringstream error_stream;
813               error_stream << "(x, y) = (" << x << ", " << y << "), group = " << g
814                        << ", channel = " << c << ", refValue = " << value_ref
815                        << ", actualValue = " << value
816                        << ", absDiff=" << std::abs(value - value_ref);
817               return error_stream.str();
818             }
819           }
820         }
821       }
822     }
823   }
824   return "";
825 }
826 
armcl_convolution_f32(benchmark::State & state,const char * net)827 void armcl_convolution_f32(benchmark::State& state, const char* net) {
828   const size_t batch_size = state.range(0);
829   const size_t input_height = state.range(1);
830   const size_t input_width = state.range(2);
831   const size_t kernel_height = state.range(3);
832   const size_t kernel_width = state.range(4);
833   const size_t padding_height = state.range(5);
834   const size_t padding_width = state.range(6);
835   const size_t subsampling = state.range(7);
836   const size_t dilation = state.range(8);
837   const size_t groups = state.range(9);
838   const size_t group_input_channels = state.range(10);
839   const size_t group_output_channels = state.range(11);
840 
841   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
842   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
843   const size_t padding_left = padding_width / 2;
844   const size_t padding_top = padding_height / 2;
845   const size_t padding_right = padding_width - padding_left;
846   const size_t padding_bottom = padding_height - padding_top;
847   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
848   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
849 
850   arm_compute::PadStrideInfo pad_stride_info(
851     subsampling /* stride height */,
852     subsampling /* stride width */,
853     padding_left, padding_right, padding_top, padding_bottom,
854     arm_compute::DimensionRoundingType::FLOOR);
855   arm_compute::Size2D dilation_info(dilation, dilation);
856   // Note: activation is disabled by default.
857   arm_compute::ActivationLayerInfo activation_info;
858 
859   // Note: no batch size and reverse order of dimensions, i.e. CWHN for NHWC.
860   arm_compute::TensorShape input_shape(
861     /* C */ groups * group_input_channels,
862     /* W */ input_width,
863     /* H */ input_height,
864     /* N */ batch_size);
865   arm_compute::TensorInfo input_info(
866     input_shape,
867     1 /* number of channels per element (!) */,
868     arm_compute::DataType::F32);
869   input_info.set_data_layout(arm_compute::DataLayout::NHWC);
870   arm_compute::Tensor input_tensor;
871   input_tensor.allocator()->init(input_info);
872   input_tensor.allocator()->allocate();
873 
874   // Note: reverse order of dimensions, i.e. for IWHO for OHWI.
875   arm_compute::TensorShape kernel_shape(
876     /* I */ groups * group_input_channels,
877     /* W */ kernel_width,
878     /* H */ kernel_height,
879     /* O */ group_output_channels);
880   arm_compute::TensorInfo kernel_info(
881     kernel_shape,
882     1 /* number of channels per element (!) */,
883     arm_compute::DataType::F32);
884   kernel_info.set_data_layout(arm_compute::DataLayout::NHWC);
885   arm_compute::Tensor kernelTensor;
886   kernelTensor.allocator()->init(kernel_info);
887   kernelTensor.allocator()->allocate();
888 
889   arm_compute::TensorShape bias_shape(groups * group_output_channels);
890   arm_compute::TensorInfo bias_info(
891     bias_shape,
892     1 /* number of channels per element (!) */,
893     arm_compute::DataType::F32);
894   bias_info.set_data_layout(arm_compute::DataLayout::NHWC);
895   arm_compute::Tensor bias_tensor;
896   bias_tensor.allocator()->init(bias_info);
897   bias_tensor.allocator()->allocate();
898 
899   // Note: no batch size and reverse order of dimensions, i.e. CWHN for NHWC.
900   arm_compute::TensorShape output_shape(
901     /* C */ groups * group_output_channels,
902     /* W */ output_width,
903     /* H */ output_height,
904     /* N */ batch_size);
905   arm_compute::TensorInfo output_info(
906     output_shape,
907     1 /* number of channels per element (!) */,
908     arm_compute::DataType::F32);
909   output_info.set_data_layout(arm_compute::DataLayout::NHWC);
910   arm_compute::Tensor output_tensor;
911   output_tensor.allocator()->init(output_info);
912   output_tensor.allocator()->allocate();
913 
914   std::random_device random_device;
915   auto rng = std::mt19937(random_device());
916   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
917 
918   std::generate(
919     reinterpret_cast<float*>(input_tensor.buffer()),
920     reinterpret_cast<float*>(input_tensor.buffer()) + input_shape.total_size(),
921     std::ref(f32rng));
922   std::generate(
923     reinterpret_cast<float*>(kernelTensor.buffer()),
924     reinterpret_cast<float*>(kernelTensor.buffer()) + kernel_shape.total_size(),
925     std::ref(f32rng));
926   std::generate(
927     reinterpret_cast<float*>(bias_tensor.buffer()),
928     reinterpret_cast<float*>(bias_tensor.buffer()) + bias_shape.total_size(),
929     std::ref(f32rng));
930   std::generate(
931     reinterpret_cast<float*>(output_tensor.buffer()),
932     reinterpret_cast<float*>(output_tensor.buffer()) + output_shape.total_size(),
933     std::ref(f32rng));
934 
935   bool is_depthwise = false;
936   if (groups != 1) {
937     // NEConvolutionLayer uses NEGEMMConvolutionLayer by default, which doesn't support grouped convolution.
938     // However, depthwise convolution is supported via NEDepthwiseConvolutionLayer.
939     if (group_input_channels == 1) {
940       is_depthwise = true;
941     } else {
942       state.SkipWithError("grouped convolution is not supported");
943       return;
944     }
945   }
946 
947   std::shared_ptr<arm_compute::IFunction> layer;
948   if (is_depthwise) {
949     if (dilation != 1) {
950       state.SkipWithError("dilated depthwise convolution is not supported");
951       return;
952     }
953 
954     // Avoid NEDepthwiseConvolutionLayer3x3 when stride isn't 2 in order to pass the output verification.
955     // TODO(b/130206370) This looks like a bug and needs further investigation.
956     if (kernel_height == 3 && kernel_width == 3 && subsampling == 2) {
957       auto* depthwise_3x3_convolution_layer = new arm_compute::NEDepthwiseConvolutionLayer3x3();
958       layer.reset(depthwise_3x3_convolution_layer);
959       depthwise_3x3_convolution_layer->configure(
960         &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
961         pad_stride_info, group_output_channels, activation_info);
962 
963       if (!depthwise_3x3_convolution_layer->validate(
964         &input_info, &kernel_info, &bias_info, &output_info,
965         pad_stride_info, group_output_channels, activation_info))
966       {
967         state.SkipWithError("validation failed");
968         return;
969       }
970     } else {
971       auto* depthwise_convolution_layer = new arm_compute::NEDepthwiseConvolutionLayer();
972       layer.reset(depthwise_convolution_layer);
973       depthwise_convolution_layer->configure(
974         &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
975         pad_stride_info, group_output_channels, activation_info);
976 
977       if (!depthwise_convolution_layer->validate(
978         &input_info, &kernel_info, &bias_info, &output_info,
979         pad_stride_info, group_output_channels, activation_info))
980       {
981         state.SkipWithError("validation failed");
982         return;
983       }
984     }
985   } else {
986     auto* convolution_layer = new arm_compute::NEConvolutionLayer();
987     layer.reset(convolution_layer);
988     convolution_layer->configure(
989       &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
990       pad_stride_info, arm_compute::WeightsInfo(), dilation_info, activation_info,
991       true /* enable fast math */, groups);
992 
993     if (!convolution_layer->validate(
994       &input_info, &kernel_info, &bias_info, &output_info,
995       pad_stride_info, arm_compute::WeightsInfo(), dilation_info, activation_info,
996       true /* enable fast math */, groups))
997     {
998       state.SkipWithError("validation failed");
999       return;
1000     }
1001   }
1002 
1003   // Dry run to let ACL do one-time initializations.
1004   arm_compute::CPPScheduler::get().set_num_threads(1);
1005   layer->run();
1006 
1007   for (auto _ : state) {
1008     state.PauseTiming();
1009     benchmark::utils::WipeCache();
1010     benchmark::utils::PrefetchToL1(
1011       input_tensor.buffer(),
1012       batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));
1013     state.ResumeTiming();
1014 
1015     layer->run();
1016   }
1017 
1018   // Validate outputs.
1019   const std::string error_string = compare_with_convolution_f32_reference_output(
1020       state, reinterpret_cast<const float*>(input_tensor.buffer()),
1021       input_shape.total_size(),
1022       reinterpret_cast<const float*>(kernelTensor.buffer()),
1023       kernel_shape.total_size(),
1024       reinterpret_cast<const float*>(bias_tensor.buffer()),
1025       bias_shape.total_size(),
1026       reinterpret_cast<const float*>(output_tensor.buffer()),
1027       output_shape.total_size());
1028 
1029   if (!error_string.empty()) {
1030     state.SkipWithError(("validation failed: " + error_string).c_str());
1031     return;
1032   }
1033 
1034   input_tensor.allocator()->free();
1035   kernelTensor.allocator()->free();
1036   bias_tensor.allocator()->free();
1037   output_tensor.allocator()->free();
1038 
1039   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
1040   if (cpu_frequency != 0) {
1041     state.counters["cpufreq"] = cpu_frequency;
1042   }
1043 
1044   state.counters["FLOPS"] = benchmark::Counter(
1045     uint64_t(state.iterations()) * 2 *
1046       batch_size * output_height * output_width *
1047       groups * group_input_channels * group_output_channels *
1048       kernel_height * kernel_width,
1049     benchmark::Counter::kIsRate);
1050 }
1051 #endif  // BENCHMARK_ARM_COMPUTE_LIBRARY
1052 
1053 // ShuffleNet v1 with 1 group.
ShuffleNetV1G1(benchmark::internal::Benchmark * b)1054 static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
1055   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1056 
1057   /*************************** Conv 1 **************************/
1058   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1059   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
1060   /******************* Stage 2: stride-2 unit ******************/
1061   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1062   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   36});
1063   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  36,    1,    1});
1064   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   36,  120});
1065   /******************* Stage 2: stride-1 units *****************/
1066   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1067   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  144,   36});
1068   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  36,    1,    1});
1069   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   36,  144});
1070   /******************* Stage 3: stride-2 unit ******************/
1071   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1072   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  144,   72});
1073   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  72,    1,    1});
1074   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   72,  144});
1075   /******************* Stage 3: stride-1 units *****************/
1076   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1077   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  288,   72});
1078   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1,  72,    1,    1});
1079   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   72,  288});
1080   /******************* Stage 4: stride-2 unit ******************/
1081   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1082   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  288,  144});
1083   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 144,    1,    1});
1084   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  144,  288});
1085   /******************* Stage 4: stride-1 units *****************/
1086   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1087   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,  144});
1088   b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 144,    1,    1});
1089   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  144,  576});
1090 }
1091 
1092 // ShuffleNet v1 with 2 groups.
ShuffleNetV1G2(benchmark::internal::Benchmark * b)1093 static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
1094   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1095 
1096   /*************************** Conv 1 **************************/
1097   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1098   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
1099   /******************* Stage 2: stride-2 unit ******************/
1100   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1101   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   50});
1102   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  50,    1,    1});
1103   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,   25,   88});
1104   /******************* Stage 2: stride-1 units *****************/
1105   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1106   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,  100,   25});
1107   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  50,    1,    1});
1108   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,   25,  100});
1109   /******************* Stage 3: stride-2 unit ******************/
1110   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1111   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,  100,   50});
1112   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 100,    1,    1});
1113   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,   50,  100});
1114   /******************* Stage 3: stride-1 units *****************/
1115   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1116   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,  200,   50});
1117   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 100,    1,    1});
1118   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,   50,  200});
1119   /******************* Stage 4: stride-2 unit ******************/
1120   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1121   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,  200,  100});
1122   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 200,    1,    1});
1123   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   2,  100,  200});
1124   /******************* Stage 4: stride-1 units *****************/
1125   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1126   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   2,  400,  100});
1127   b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 200,    1,    1});
1128   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   2,  100,  400});
1129 }
1130 
1131 // ShuffleNet v1 with 3 groups.
ShuffleNetV1G3(benchmark::internal::Benchmark * b)1132 static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
1133   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1134 
1135   /*************************** Conv 1 **************************/
1136   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1137   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
1138   /******************* Stage 2: stride-2 unit ******************/
1139   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1140   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   60});
1141   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  60,    1,    1});
1142   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   20,   72});
1143   /******************* Stage 2: stride-1 units *****************/
1144   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1145   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   80,   20});
1146   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  60,    1,    1});
1147   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   20,   80});
1148   /******************* Stage 3: stride-2 unit ******************/
1149   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1150   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   80,   40});
1151   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 120,    1,    1});
1152   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,   40,   80});
1153   /******************* Stage 3: stride-1 units *****************/
1154   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1155   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,  160,   40});
1156   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 120,    1,    1});
1157   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,   40,  160});
1158   /******************* Stage 4: stride-2 unit ******************/
1159   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1160   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,  160,   80});
1161   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 240,    1,    1});
1162   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   3,   80,  160});
1163   /******************* Stage 4: stride-1 units *****************/
1164   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1165   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   3,  320,   80});
1166   b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 240,    1,    1});
1167   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   3,   80,  320});
1168 }
1169 
1170 // ShuffleNet v1 with 4 groups.
ShuffleNetV1G4(benchmark::internal::Benchmark * b)1171 static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
1172   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1173 
1174   /*************************** Conv 1 **************************/
1175   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1176   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
1177   /******************* Stage 2: stride-2 unit ******************/
1178   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1179   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   68});
1180   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  68,    1,    1});
1181   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   17,   62});
1182   /******************* Stage 2: stride-1 units *****************/
1183   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1184   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   68,   17});
1185   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  68,    1,    1});
1186   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   17,   68});
1187   /******************* Stage 3: stride-2 unit ******************/
1188   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1189   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   68,   34});
1190   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 136,    1,    1});
1191   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,   34,   68});
1192   /******************* Stage 3: stride-1 units *****************/
1193   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1194   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,  136,   34});
1195   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 136,    1,    1});
1196   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,   34,  136});
1197   /******************* Stage 4: stride-2 unit ******************/
1198   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1199   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,  136,   68});
1200   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 272,    1,    1});
1201   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   4,   68,  136});
1202   /******************* Stage 4: stride-1 units *****************/
1203   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1204   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   4,  272,   68});
1205   b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 272,    1,    1});
1206   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   4,   68,  272});
1207 }
1208 
1209 // ShuffleNet v1 with 8 groups.
ShuffleNetV1G8(benchmark::internal::Benchmark * b)1210 static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
1211   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1212 
1213   /*************************** Conv 1 **************************/
1214   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1215   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
1216   /******************* Stage 2: stride-2 unit ******************/
1217   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1218   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   96});
1219   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  96,    1,    1});
1220   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   12,   45});
1221   /******************* Stage 2: stride-1 units *****************/
1222   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1223   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   48,   12});
1224   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  96,    1,    1});
1225   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   12,   48});
1226   /******************* Stage 3: stride-2 unit ******************/
1227   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1228   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   48,   24});
1229   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 192,    1,    1});
1230   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   24,   48});
1231   /******************* Stage 3: stride-1 units *****************/
1232   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1233   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   96,   24});
1234   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 192,    1,    1});
1235   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   24,   96});
1236   /******************* Stage 4: stride-2 unit ******************/
1237   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1238   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   96,   48});
1239   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 384,    1,    1});
1240   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   8,   48,   96});
1241   /******************* Stage 4: stride-1 units *****************/
1242   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1243   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   8,  192,   48});
1244   b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 384,    1,    1});
1245   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   8,   48,  192});
1246 }
1247 
1248 // ShuffleNet v2 (0.5X scale)
ShuffleNetV2X05(benchmark::internal::Benchmark * b)1249 static void ShuffleNetV2X05(benchmark::internal::Benchmark* b) {
1250   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1251 
1252   /*************************** Conv 1 **************************/
1253   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1254   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
1255   /************************** Stage 2 **************************/
1256   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1257   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
1258   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   24});
1259   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   24});
1260   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  24,    1,    1});
1261   /************************** Stage 3 **************************/
1262   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1263   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  48,    1,    1});
1264   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   48,   48});
1265   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   48,   48});
1266   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1,  48,    1,    1});
1267   /************************** Stage 4 **************************/
1268   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1269   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1,  96,    1,    1});
1270   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,   96});
1271   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,   96});
1272   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1,  96,    1,    1});
1273   /*************************** Conv 5 **************************/
1274   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1275   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  192, 1024});
1276 }
1277 
1278 // ShuffleNet v2 (1.0X scale)
ShuffleNetV2X10(benchmark::internal::Benchmark * b)1279 static void ShuffleNetV2X10(benchmark::internal::Benchmark* b) {
1280   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1281 
1282   /*************************** Conv 1 **************************/
1283   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1284   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
1285   /************************** Stage 2 **************************/
1286   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1287   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
1288   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   58});
1289   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   58});
1290   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  58,    1,    1});
1291   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   58,   58});
1292   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  58,    1,    1});
1293   /************************** Stage 3 **************************/
1294   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1295   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 116,    1,    1});
1296   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  116,  116});
1297   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  116,  116});
1298   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 116,    1,    1});
1299   /************************** Stage 4 **************************/
1300   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1301   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 232,    1,    1});
1302   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  232,  232});
1303   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  232,  232});
1304   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 232,    1,    1});
1305   /*************************** Conv 5 **************************/
1306   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1307   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  464, 1024});
1308 }
1309 
1310 // ShuffleNet v2 (1.5X scale)
ShuffleNetV2X15(benchmark::internal::Benchmark * b)1311 static void ShuffleNetV2X15(benchmark::internal::Benchmark* b) {
1312   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1313 
1314   /*************************** Conv 1 **************************/
1315   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1316   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
1317   /************************** Stage 2 **************************/
1318   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1319   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
1320   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   88});
1321   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   88});
1322   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  88,    1,    1});
1323   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   88,   88});
1324   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  88,    1,    1});
1325   /************************** Stage 3 **************************/
1326   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1327   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 176,    1,    1});
1328   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  176,  176});
1329   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  176,  176});
1330   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 176,    1,    1});
1331   /************************** Stage 4 **************************/
1332   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1333   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 352,    1,    1});
1334   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  352,  352});
1335   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  352,  352});
1336   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 352,    1,    1});
1337   /*************************** Conv 5 **************************/
1338   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1339   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  704, 1024});
1340 }
1341 
1342 // ShuffleNet v2 (2.0X scale)
ShuffleNetV2X20(benchmark::internal::Benchmark * b)1343 static void ShuffleNetV2X20(benchmark::internal::Benchmark* b) {
1344   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1345 
1346   /*************************** Conv 1 **************************/
1347   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1348   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
1349   /************************** Stage 2 **************************/
1350   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1351   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
1352   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,  122});
1353   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,  122});
1354   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 122,    1,    1});
1355   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  122,  122});
1356   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 122,    1,    1});
1357   /************************** Stage 3 **************************/
1358   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1359   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 244,    1,    1});
1360   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  244,  244});
1361   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  244,  244});
1362   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 244,    1,    1});
1363   /************************** Stage 4 **************************/
1364   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1365   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 488,    1,    1});
1366   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  488,  488});
1367   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  488,  488});
1368   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 488,    1,    1});
1369   /*************************** Conv 5 **************************/
1370   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1371   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  976, 2048});
1372 }
1373 
MobileNetV1(benchmark::internal::Benchmark * b)1374 static void MobileNetV1(benchmark::internal::Benchmark* b) {
1375   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1376 
1377   /*       N   H    W   KH  KW  PH  PW  S  D    G   GCin  GCout */
1378   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,    1,    3,   32});
1379   b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1,   32,    1,    1});
1380   b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,    1,   32,   64});
1381   b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,   64,    1,    1});
1382   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,    1,   64,  128});
1383   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1,  128,    1,    1});
1384   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,    1,  128,  128});
1385   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  128,    1,    1});
1386   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,    1,  128,  256});
1387   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  256,    1,    1});
1388   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,    1,  256,  256});
1389   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  256,    1,    1});
1390   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,    1,  256,  512});
1391   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1,  512,    1,    1});
1392   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,    1,  512,  512});
1393   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1,  512,    1,    1});
1394   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,    1,  512, 1024});
1395   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 1024,    1,    1});
1396   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,    1, 1024, 1024});
1397 }
1398 
MobileNetV2(benchmark::internal::Benchmark * b)1399 static void MobileNetV2(benchmark::internal::Benchmark* b) {
1400   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1401 
1402   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1403   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   32});
1404 
1405   /************************ Bottleneck 1 ***********************/
1406   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1407   b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1,  32,    1,    1});
1408   b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   32,   16});
1409 
1410   /************************ Bottleneck 2 ***********************/
1411   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1412   b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   16,   96});
1413   b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,  96,    1,    1});
1414   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   96,   24});
1415   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,  144});
1416   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 144,    1,    1});
1417   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,  144,   24});
1418 
1419   /************************ Bottleneck 3 ***********************/
1420   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1421 //b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,  144});
1422   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 144,    1,    1});
1423   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  144,   32});
1424   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   32,  192});
1425   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 192,    1,    1});
1426   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  192,   32});
1427 //b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   32,  192});
1428 //b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 192,    1,    1});
1429 //b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  192,   32});
1430 
1431   /************************ Bottleneck 4 ***********************/
1432   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1433 //b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   32,  192});
1434   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 192,    1,    1});
1435   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  192,   64});
1436   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
1437   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
1438   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   64});
1439 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
1440 //b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
1441 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   64});
1442 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
1443 //b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
1444 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   64});
1445 
1446   /************************ Bottleneck 5 ***********************/
1447   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1448 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
1449 //b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
1450   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   96});
1451   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1452   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 576,    1,    1});
1453   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  576,   96});
1454 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1455 //b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 576,    1,    1});
1456 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  576,   96});
1457 
1458   /************************ Bottleneck 6 ***********************/
1459   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1460 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1461   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 576,    1,    1});
1462   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,  160});
1463   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
1464   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 960,    1,    1});
1465   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});
1466 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
1467 //b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 960,    1,    1});
1468 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});
1469 
1470   /************************ Bottleneck 7 ***********************/
1471   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1472 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
1473 //b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 960,    1,    1});
1474   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  320});
1475 
1476   /******************** Pre-pooling Conv2D *********************/
1477   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1478   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  320, 1280});
1479   /******************** Post-pooling Conv2D ********************/
1480   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1481   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1, 1280, 1000});
1482 }
1483 
MobileNetV3Small(benchmark::internal::Benchmark * b)1484 static void MobileNetV3Small(benchmark::internal::Benchmark* b) {
1485   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1486 
1487   /*********************** Initial Stage ***********************/
1488   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1489   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   16});
1490   /*********************** Bottleneck 1 ************************/
1491   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1492   b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,  16,    1,    1});
1493   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   16,    8});
1494   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,    8,   16});
1495   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   16,   16});
1496   /*********************** Bottleneck 2 ************************/
1497   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1498   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   16,   72});
1499   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  72,    1,    1});
1500   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   72,   24});
1501   /*********************** Bottleneck 3 ************************/
1502   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1503   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   88});
1504   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  88,    1,    1});
1505   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   88,   24});
1506   /*********************** Bottleneck 4 ************************/
1507   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1508   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   96});
1509   b->Args({1,  28,  28,  5,  5,  4,  4, 2, 1,  96,    1,    1});
1510   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   96,   24});
1511   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   24,   96});
1512   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,   40});
1513   /*********************** Bottleneck 5 ************************/
1514   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1515   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   40,  240});
1516   b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 240,    1,    1});
1517   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,   64});
1518   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   64,  240});
1519   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  240,   40});
1520   /*********************** Bottleneck 6 ************************/
1521   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1522 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   40,  240});
1523 //b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 240,    1,    1});
1524 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,   64});
1525 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   64,  240});
1526 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  240,   40});
1527   /*********************** Bottleneck 7 ************************/
1528   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1529   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   40,  120});
1530   b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 120,    1,    1});
1531   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,   32});
1532   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   32,  120});
1533   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  120,   48});
1534   /*********************** Bottleneck 8 ************************/
1535   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1536   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   48,  144});
1537   b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 144,    1,    1});
1538   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  144,   40});
1539   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   40,  144});
1540   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  144,   48});
1541   /*********************** Bottleneck 9 ************************/
1542   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1543   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   48,  288});
1544   b->Args({1,  14,  14,  5,  5,  4,  4, 2, 1, 288,    1,    1});
1545   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  288,   72});
1546   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   72,  288});
1547   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  288,   96});
1548   /*********************** Bottleneck 10 ***********************/
1549   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1550   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1551   b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 576,    1,    1});
1552   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  576,  144});
1553   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  144,  576});
1554   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,   96});
1555   /*********************** Bottleneck 11 ***********************/
1556   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1557 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1558 //b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 576,    1,    1});
1559 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  576,  144});
1560 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  144,  576});
1561 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,   96});
1562   /************************ Last Stage  ************************/
1563   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1564 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1565   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  576, 1024});
1566   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1, 1024, 1001});
1567 }
1568 
MobileNetV3Large(benchmark::internal::Benchmark * b)1569 static void MobileNetV3Large(benchmark::internal::Benchmark* b) {
1570   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1571 
1572   /*********************** Initial Stage ***********************/
1573   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1574   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   16});
1575   /*********************** Bottleneck 1 ************************/
1576   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1577   b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1,  16,    1,    1});
1578   b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   16,   16});
1579   /*********************** Bottleneck 2 ************************/
1580   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1581   b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   16,   64});
1582   b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,  64,    1,    1});
1583   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   64,   24});
1584   /*********************** Bottleneck 3 ************************/
1585   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1586   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   72});
1587   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1,  72,    1,    1});
1588   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   72,   24});
1589   /*********************** Bottleneck 4 ************************/
1590   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1591 //b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   72});
1592   b->Args({1,  56,  56,  5,  5,  4,  4, 2, 1,  72,    1,    1});
1593   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   72,   24});
1594   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   24,   72});
1595   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   72,   40});
1596   /*********************** Bottleneck 5 ************************/
1597   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1598   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   40,  120});
1599   b->Args({1,  28,  28,  5,  5,  4,  4, 1, 1, 120,    1,    1});
1600   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,   32});
1601   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   32,  120});
1602   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  120,   40});
1603   /*********************** Bottleneck 6 ************************/
1604   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1605 //b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   40,  120});
1606 //b->Args({1,  28,  28,  5,  5,  4,  4, 1, 1, 120,    1,    1});
1607 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,   32});
1608 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   32,  120});
1609 //b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  120,   40});
1610   /*********************** Bottleneck 7 ************************/
1611   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1612   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   40,  240});
1613   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 240,    1,    1});
1614   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  240,   80});
1615   /*********************** Bottleneck 8 ************************/
1616   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1617   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  200});
1618   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 200,    1,    1});
1619   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  200,   80});
1620   /*********************** Bottleneck 9 ************************/
1621   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1622   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  184});
1623   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 184,    1,    1});
1624   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  184,   80});
1625   /********************** Bottleneck 10 ***********************/
1626   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1627 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  184});
1628 //b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 184,    1,    1});
1629 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  184,   80});
1630   /********************** Bottleneck 11 ***********************/
1631   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1632   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  480});
1633   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 480,    1,    1});
1634   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  480,  120});
1635   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,  480});
1636   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  480,  112});
1637   /********************** Bottleneck 12 ***********************/
1638   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1639   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  112,  672});
1640   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 672,    1,    1});
1641   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  672,  168});
1642   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  168,  672});
1643   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  672,  112});
1644   /********************** Bottleneck 13 ***********************/
1645   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1646 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  112,  672});
1647   b->Args({1,  14,  14,  5,  5,  4,  4, 2, 1, 672,    1,    1});
1648   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  672,  160});
1649   /********************** Bottleneck 14 ***********************/
1650   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1651   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
1652   b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 960,    1,    1});
1653   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  960,  240});
1654   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,  960});
1655   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});
1656   /********************** Bottleneck 15 ***********************/
1657   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1658 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
1659 //b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 960,    1,    1});
1660 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  960,  240});
1661 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,  960});
1662 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});
1663   /************************ Last Stage  ***********************/
1664   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1665 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
1666   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  960, 1280});
1667   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1, 1280, 1001});
1668 }
1669 
1670 // SqueezeNet 1.0
SqueezeNetV10(benchmark::internal::Benchmark * b)1671 static void SqueezeNetV10(benchmark::internal::Benchmark* b) {
1672   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1673 
1674   /************************** Conv 1 *************************/
1675   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1676   b->Args({1, 224, 224,  7,  7,  6,  6, 2, 1, 1,    3,   96});
1677   /************************** Fire 2 *************************/
1678   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1679   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   96,   16});
1680   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
1681   b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
1682   /************************** Fire 3 *************************/
1683   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1684   b->Args({1,  56,  55,  1,  1,  0,  0, 1, 1, 1,  128,   16});
1685 //b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
1686 //b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
1687   /************************** Fire 4 *************************/
1688   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1689   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,  128,   32});
1690   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   32,  128});
1691   b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   32,  128});
1692   /************************** Fire 5 *************************/
1693   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1694   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  256,   32});
1695   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   32,  128});
1696   b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   32,  128});
1697   /************************** Fire 6 *************************/
1698   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1699   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  256,   48});
1700   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   48,  192});
1701   b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   48,  192});
1702   /************************** Fire 7 *************************/
1703   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1704   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  384,   48});
1705 //b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   48,  192});
1706 //b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   48,  192});
1707   /************************** Fire 8 *************************/
1708   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1709   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  384,   64});
1710   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   64,  256});
1711   b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   64,  256});
1712   /************************** Fire 9 *************************/
1713   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1714   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512,   64});
1715   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   64,  256});
1716   b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   64,  256});
1717   /************************* Conv 10 *************************/
1718   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1719   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512, 1000});
1720 }
1721 
1722 // SqueezeNet 1.1
SqueezeNetV11(benchmark::internal::Benchmark * b)1723 static void SqueezeNetV11(benchmark::internal::Benchmark* b) {
1724   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1725 
1726   /************************** Conv 1 *************************/
1727   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1728   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1, 1,    3,   64});
1729   /************************** Fire 2 *************************/
1730   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1731   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   64,   16});
1732   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
1733   b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
1734   /************************** Fire 3 *************************/
1735   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1736   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,  128,   16});
1737 //b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
1738 //b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
1739   /************************** Fire 4 *************************/
1740   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1741   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  128,   32});
1742   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   32,  128});
1743   b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   32,  128});
1744   /************************** Fire 5 *************************/
1745   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1746   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  256,   32});
1747 //b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   32,  128});
1748 //b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   32,  128});
1749   /************************** Fire 6 *************************/
1750   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1751   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  256,   48});
1752   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   48,  192});
1753   b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   48,  192});
1754   /************************** Fire 7 *************************/
1755   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1756   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  384,   48});
1757 //b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   48,  192});
1758 //b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   48,  192});
1759   /************************** Fire 8 *************************/
1760   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1761   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  384,   64});
1762   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   64,  256});
1763   b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   64,  256});
1764   /************************** Fire 9 *************************/
1765   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1766   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512,   64});
1767 //b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   64,  256});
1768 //b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   64,  256});
1769   /************************* Conv 10 *************************/
1770   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1771   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512, 1000});
1772 }
1773 
InceptionV3(benchmark::internal::Benchmark * b)1774 static void InceptionV3(benchmark::internal::Benchmark* b) {
1775   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1776 
1777   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1778   b->Args({1, 299, 299,  3,  3,  0,  0, 2, 1, 1,    3,   32});
1779   b->Args({1, 149, 149,  3,  3,  0,  0, 1, 1, 1,   32,   32});
1780   b->Args({1, 147, 147,  3,  3,  2,  2, 1, 1, 1,   32,   64});
1781   b->Args({1,  73,  73,  1,  1,  0,  0, 1, 1, 1,   64,   80});
1782   b->Args({1,  73,  73,  3,  3,  0,  0, 1, 1, 1,   80,  192});
1783   b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  192,   64});
1784   b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  192,   48});
1785   b->Args({1,  35,  35,  5,  5,  4,  4, 1, 1, 1,   48,   64});
1786   b->Args({1,  35,  35,  3,  3,  2,  2, 1, 1, 1,   64,   96});
1787   b->Args({1,  35,  35,  3,  3,  2,  2, 1, 1, 1,   96,   96});
1788   b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  192,   32});
1789   b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  256,   64});
1790   b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  256,   48});
1791   b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  288,   64});
1792   b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  288,   48});
1793   b->Args({1,  35,  35,  3,  3,  0,  0, 2, 1, 1,  288,  384});
1794   b->Args({1,  35,  35,  3,  3,  0,  0, 2, 1, 1,   96,   96});
1795   b->Args({1,  17,  17,  1,  1,  0,  0, 1, 1, 1,  768,  192});
1796   b->Args({1,  17,  17,  1,  1,  0,  0, 1, 1, 1,  768,  128});
1797   b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  128,  128});
1798   b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  128,  192});
1799   b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  128,  128});
1800   b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  128,  192});
1801   b->Args({1,  17,  17,  1,  1,  0,  0, 1, 1, 1,  768,  160});
1802   b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  160,  160});
1803   b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  160,  192});
1804   b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  160,  160});
1805   b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  160,  192});
1806   b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  192,  192});
1807   b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  192,  192});
1808   b->Args({1,  17,  17,  3,  3,  0,  0, 2, 1, 1,  192,  320});
1809   b->Args({1,  17,  17,  3,  3,  0,  0, 2, 1, 1,  192,  192});
1810   b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  320});
1811   b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  384});
1812   b->Args({1,   8,   8,  1,  3,  0,  2, 1, 1, 1,  384,  384});
1813   b->Args({1,   8,   8,  3,  1,  2,  0, 1, 1, 1,  384,  384});
1814   b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  448});
1815   b->Args({1,   8,   8,  3,  3,  2,  2, 1, 1, 1,  448,  384});
1816   b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  192});
1817   b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  320});
1818   b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  384});
1819   b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  448});
1820   b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  192});
1821   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1, 1, 2048, 1001});
1822 }
1823 
ResNet18(benchmark::internal::Benchmark * b)1824 static void ResNet18(benchmark::internal::Benchmark* b) {
1825   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1826 
1827   /************************* Conv 1 *************************/
1828   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1829   b->Args({1, 224, 224,  7,  7,  6,  6, 2, 1, 1,    3,   64});
1830   /************************ Conv 2.X ************************/
1831   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1832   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,   64,   64});
1833   /************************ Conv 3.X ************************/
1834   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1835   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 1,   64,  128});
1836   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  128,  128});
1837   b->Args({1,  56,  56,  1,  1,  0,  0, 2, 1, 1,   64,  128});
1838   /************************ Conv 4.X ************************/
1839   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1840   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 1,  128,  256});
1841   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 1,  256,  256});
1842   b->Args({1,  28,  28,  1,  1,  0,  0, 2, 1, 1,  128,  256});
1843   /************************ Conv 5.X ************************/
1844   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1845   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 1,  256,  512});
1846   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 1,  512,  512});
1847   b->Args({1,  14,  14,  1,  1,  0,  0, 2, 1, 1,  256,  512});
1848 }
1849 
ResNet50(benchmark::internal::Benchmark * b)1850 static void ResNet50(benchmark::internal::Benchmark* b) {
1851   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1852 
1853   /************************* Conv 1 *************************/
1854   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1855   b->Args({1, 224, 224,  7,  7,  6,  6, 2, 1, 1,    3,   64});
1856   /************************ Conv 2.1 ************************/
1857   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1858   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,   64});
1859   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,   64,   64});
1860   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,  256});
1861 //b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,  256});
1862   /************************ Conv 2.X ************************/
1863   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1864   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,  256,   64});
1865 //b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,   64,   64});
1866 //b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,  256});
1867   /************************ Conv 3.1 ************************/
1868   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1869   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,  256,  128});
1870   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 1,  128,  128});
1871   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  128,  512});
1872   b->Args({1,  56,  56,  1,  1,  0,  0, 2, 1, 1,  256,  512});
1873   /************************ Conv 3.X ************************/
1874   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1875   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  512,  128});
1876   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  128,  128});
1877 //b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  128,  512});
1878   /************************ Conv 4.1 ************************/
1879   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1880   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  512,  256});
1881   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 1,  256,  256});
1882   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1,  256, 1024});
1883   b->Args({1,  28,  28,  1,  1,  0,  0, 2, 1, 1,  512, 1024});
1884   /************************ Conv 4.X ************************/
1885   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1886   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1, 1024,  256});
1887   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 1,  256,  256});
1888 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1,  256, 1024});
1889   /************************ Conv 5.1 ************************/
1890   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1891   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1, 1024,  512});
1892   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 1,  512,  512});
1893   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1, 1,  512, 2048});
1894   b->Args({1,  14,  14,  1,  1,  0,  0, 2, 1, 1, 1024, 2048});
1895   /************************ Conv 5.X ************************/
1896   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1897   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1, 1, 2048,  512});
1898   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 1,  512,  512});
1899 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1, 1,  512, 2048});
1900 }
1901 
VGG(benchmark::internal::Benchmark * b)1902 static void VGG(benchmark::internal::Benchmark* b) {
1903   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1904 
1905   /************************* Conv 1.1 ************************/
1906   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1907   b->Args({1, 224, 224,  3,  3,  2,  2, 1, 1, 1,    3,   64});
1908   /************************* Conv 1.2 ************************/
1909   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1910   b->Args({1, 224, 224,  3,  3,  2,  2, 1, 1, 1,   64,   64});
1911 
1912   /************************* Conv 2.1 ************************/
1913   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1914   b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1, 1,   64,  128});
1915   /************************* Conv 2.2 ************************/
1916   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1917   b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1, 1,  128,  128});
1918 
1919   /************************* Conv 3.1 ************************/
1920   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1921   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,  128,  256});
1922   /************************* Conv 3.2 ************************/
1923   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1924   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,  256,  256});
1925   /************************* Conv 3.3 ************************/
1926   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1927   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,  256,  256});
1928 
1929   /************************* Conv 4.1 ************************/
1930   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1931   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  256,  512});
1932   /************************* Conv 4.2 ************************/
1933   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1934   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  512,  512});
1935   /************************* Conv 4.3 ************************/
1936   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1937   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  512,  512});
1938 
1939   /************************* Conv 5.X ************************/
1940   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1941   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 1,  512,  512});
1942   /************************* Conv 5.3 ************************/
1943   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1944   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1,  512,  512});
1945 }
1946 
1947 // SRCNN (9-1-5)
SRCNN915(benchmark::internal::Benchmark * b)1948 static void SRCNN915(benchmark::internal::Benchmark* b) {
1949   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1950 
1951   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1952   b->Args({1, 384, 384,  9,  9,  0,  0, 1, 1, 1,    1,   64});
1953   b->Args({1, 376, 376,  1,  1,  0,  0, 1, 1, 1,   64,   32});
1954   b->Args({1, 376, 376,  5,  5,  0,  0, 1, 1, 1,   32,    1});
1955 }
1956 
1957 // SRCNN (9-3-5)
SRCNN935(benchmark::internal::Benchmark * b)1958 static void SRCNN935(benchmark::internal::Benchmark* b) {
1959   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1960 
1961   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1962   b->Args({1, 384, 384,  9,  9,  0,  0, 1, 1, 1,    1,   64});
1963   b->Args({1, 376, 376,  3,  3,  0,  0, 1, 1, 1,   64,   32});
1964   b->Args({1, 374, 374,  5,  5,  0,  0, 1, 1, 1,   32,    1});
1965 }
1966 
1967 // SRCNN (9-5-5)
SRCNN955(benchmark::internal::Benchmark * b)1968 static void SRCNN955(benchmark::internal::Benchmark* b) {
1969   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1970 
1971   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1972   b->Args({1, 384, 384,  9,  9,  0,  0, 1, 1, 1,    1,   64});
1973   b->Args({1, 376, 376,  5,  5,  0,  0, 1, 1, 1,   64,   32});
1974   b->Args({1, 372, 372,  5,  5,  0,  0, 1, 1, 1,   32,    1});
1975 }
1976 
1977 #ifndef XNN_NO_F16_OPERATORS
1978   BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
1979   BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
1980   BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
1981   BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
1982   BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
1983   BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
1984   BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
1985   BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
1986   BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
1987   BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
1988   BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
1989   BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
1990   BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
1991   BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
1992   BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
1993   BENCHMARK_CAPTURE(xnnpack_convolution_f16, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
1994   BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
1995   BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
1996   BENCHMARK_CAPTURE(xnnpack_convolution_f16, vgg, "VGG")->Apply(VGG)->UseRealTime();
1997   BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
1998   BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
1999   BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
2000 #endif  // XNN_NO_F16_OPERATORS
2001 
2002 #ifndef XNN_NO_F32_OPERATORS
2003   BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
2004   BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
2005   BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
2006   BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
2007   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
2008   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
2009   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
2010   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
2011   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
2012   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
2013   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
2014   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
2015   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
2016   BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
2017   BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
2018   BENCHMARK_CAPTURE(xnnpack_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
2019   BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
2020   BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
2021   BENCHMARK_CAPTURE(xnnpack_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
2022   BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
2023   BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
2024   BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
2025 #endif  // XNN_NO_F32_OPERATORS
2026 
2027 #ifndef XNN_NO_QS8_OPERATORS
2028   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
2029   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
2030   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
2031   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
2032   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
2033   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
2034   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
2035   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
2036   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
2037   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
2038   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
2039   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
2040   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
2041   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
2042   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
2043   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
2044   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
2045   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
2046   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, vgg, "VGG")->Apply(VGG)->UseRealTime();
2047   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
2048   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
2049   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
2050 #endif  // XNN_NO_QS8_OPERATORS
2051 
2052 #ifndef XNN_NO_QU8_OPERATORS
2053   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
2054   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
2055   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
2056   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
2057   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
2058   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
2059   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
2060   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
2061   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
2062   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
2063   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
2064   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
2065   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
2066   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
2067   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
2068   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
2069   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
2070   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
2071   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, vgg, "VGG")->Apply(VGG)->UseRealTime();
2072   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
2073   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
2074   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
2075 #endif  // XNN_NO_QU8_OPERATORS
2076 
// TensorFlow Lite F32 convolution: the same network shape lists as the XNNPACK
// benchmarks, registered only when BENCHMARK_TENSORFLOW_LITE is defined.
#ifdef BENCHMARK_TENSORFLOW_LITE
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v1, "MobileNet v1")
      ->Apply(MobileNetV1)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v2, "MobileNet v2")
      ->Apply(MobileNetV2)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")
      ->Apply(MobileNetV3Small)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")
      ->Apply(MobileNetV3Large)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")
      ->Apply(ShuffleNetV1G1)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")
      ->Apply(ShuffleNetV1G2)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")
      ->Apply(ShuffleNetV1G3)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")
      ->Apply(ShuffleNetV1G4)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")
      ->Apply(ShuffleNetV1G8)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")
      ->Apply(ShuffleNetV2X05)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")
      ->Apply(ShuffleNetV2X10)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")
      ->Apply(ShuffleNetV2X15)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")
      ->Apply(ShuffleNetV2X20)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")
      ->Apply(SqueezeNetV10)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")
      ->Apply(SqueezeNetV11)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, inception_v3, "Inception v3")
      ->Apply(InceptionV3)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, resnet18, "ResNet-18")
      ->Apply(ResNet18)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, resnet50, "ResNet-50")
      ->Apply(ResNet50)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, vgg, "VGG")
      ->Apply(VGG)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn915, "SRCNN (9-1-5)")
      ->Apply(SRCNN915)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn935, "SRCNN (9-3-5)")
      ->Apply(SRCNN935)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn955, "SRCNN (9-5-5)")
      ->Apply(SRCNN955)->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE
2101 
// Arm Compute Library F32 convolution, registered only when
// BENCHMARK_ARM_COMPUTE_LIBRARY is defined. Unlike the other backends in this
// file, the MobileNet v3 Small/Large shape lists are not registered here.
#ifdef BENCHMARK_ARM_COMPUTE_LIBRARY
  BENCHMARK_CAPTURE(armcl_convolution_f32, mobilenet_v1, "MobileNet v1")
      ->Apply(MobileNetV1)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, mobilenet_v2, "MobileNet v2")
      ->Apply(MobileNetV2)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")
      ->Apply(ShuffleNetV1G1)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")
      ->Apply(ShuffleNetV1G2)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")
      ->Apply(ShuffleNetV1G3)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")
      ->Apply(ShuffleNetV1G4)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")
      ->Apply(ShuffleNetV1G8)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")
      ->Apply(ShuffleNetV2X05)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")
      ->Apply(ShuffleNetV2X10)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")
      ->Apply(ShuffleNetV2X15)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")
      ->Apply(ShuffleNetV2X20)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")
      ->Apply(SqueezeNetV10)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")
      ->Apply(SqueezeNetV11)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, inception_v3, "Inception v3")
      ->Apply(InceptionV3)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, resnet18, "ResNet-18")
      ->Apply(ResNet18)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, resnet50, "ResNet-50")
      ->Apply(ResNet50)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, vgg, "VGG")
      ->Apply(VGG)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, srcnn915, "SRCNN (9-1-5)")
      ->Apply(SRCNN915)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, srcnn935, "SRCNN (9-3-5)")
      ->Apply(SRCNN935)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, srcnn955, "SRCNN (9-5-5)")
      ->Apply(SRCNN955)->UseRealTime();
#endif  // BENCHMARK_ARM_COMPUTE_LIBRARY
2124 
2125 #ifndef XNNPACK_BENCHMARK_NO_MAIN
2126 BENCHMARK_MAIN();
2127 #endif
2128