• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
#include <algorithm>
#include <cassert>
#include <cfloat>
#include <cmath>
#include <functional>
#include <limits>
#include <memory>
#include <ostream>
#include <random>
#include <sstream>
#include <string>
#include <vector>
17 
18 #include <cpuinfo.h>
19 #include <xnnpack.h>
20 
21 #ifdef BENCHMARK_ARM_COMPUTE_LIBRARY
22 #include "arm_compute/core/Types.h"
23 #include "arm_compute/runtime/Tensor.h"
24 #include "arm_compute/runtime/CPP/CPPScheduler.h"
25 #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
26 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
27 #endif  // BENCHMARK_ARM_COMPUTE_LIBRARY
28 #include <benchmark/benchmark.h>
29 #ifdef BENCHMARK_TENSORFLOW_LITE
30 #include "flatbuffers/include/flatbuffers/flatbuffers.h"
31 #include "tensorflow/lite/interpreter.h"
32 #include "tensorflow/lite/kernels/register.h"
33 #include "tensorflow/lite/model.h"
34 #include "tensorflow/lite/schema/schema_generated.h"
35 #include "tensorflow/lite/version.h"
36 #endif  // BENCHMARK_TENSORFLOW_LITE
37 #include "bench/utils.h"
38 
39 
xnnpack_convolution_q8(benchmark::State & state,const char * net)40 void xnnpack_convolution_q8(benchmark::State& state, const char* net) {
41   const size_t batch_size = state.range(0);
42   const size_t input_height = state.range(1);
43   const size_t input_width = state.range(2);
44   const size_t kernel_height = state.range(3);
45   const size_t kernel_width = state.range(4);
46   const size_t padding_height = state.range(5);
47   const size_t padding_width = state.range(6);
48   const size_t subsampling = state.range(7);
49   const size_t dilation = state.range(8);
50   const size_t groups = state.range(9);
51   const size_t group_input_channels = state.range(10);
52   const size_t group_output_channels = state.range(11);
53 
54   std::random_device random_device;
55   auto rng = std::mt19937(random_device());
56   auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
57   auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
58 
59   const size_t output_pixel_stride = groups * group_output_channels;
60   const size_t input_pixel_stride = groups * group_input_channels;
61   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
62   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
63   const size_t padding_left = padding_width / 2;
64   const size_t padding_top = padding_height / 2;
65   const size_t padding_right = padding_width - padding_left;
66   const size_t padding_bottom = padding_height - padding_top;
67   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
68   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
69 
70   std::vector<uint8_t> input(batch_size * input_height * input_width * input_pixel_stride);
71   std::generate(input.begin(), input.end(), std::ref(u8rng));
72   std::vector<uint8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
73   std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
74   std::vector<int32_t> bias(groups * group_output_channels);
75   std::generate(bias.begin(), bias.end(), std::ref(s32rng));
76   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
77 
78   xnn_status status = xnn_initialize(nullptr /* allocator */);
79   if (status != xnn_status_success) {
80     state.SkipWithError("failed to initialize XNNPACK");
81     return;
82   }
83 
84   if (!cpuinfo_initialize()) {
85     state.SkipWithError("cpuinfo initialization failed");
86     return;
87   }
88   const size_t num_buffers = 1 +
89     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
90       sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements);
91   std::vector<uint8_t> output(output_elements * num_buffers);
92 
93   std::vector<xnn_operator_t> convolution_operators(num_buffers);
94   for (xnn_operator_t& convolution_op : convolution_operators) {
95     status = xnn_create_convolution2d_nhwc_q8(
96       padding_top, padding_right, padding_bottom, padding_left,
97       kernel_height, kernel_width,
98       subsampling, subsampling,
99       dilation, dilation,
100       groups, group_input_channels, group_output_channels,
101       input_pixel_stride, output_pixel_stride,
102       127, 0.5f,
103       127, 0.5f,
104       kernel.data(), bias.data(),
105       127, 0.5f, 0, 255,
106       0 /* flags */, &convolution_op);
107     if (status != xnn_status_success) {
108       state.SkipWithError("failed to create QINT8 Convolution operator");
109       return;
110     }
111   }
112 
113   for (size_t i = 0; i < convolution_operators.size(); i++) {
114     status = xnn_setup_convolution2d_nhwc_q8(
115       convolution_operators[i],
116       batch_size, input_height, input_width,
117       input.data(), output.data() + i * output_elements,
118       nullptr /* thread pool */);
119     if (status != xnn_status_success) {
120       state.SkipWithError("failed to setup QINT8 Convolution operator");
121       return;
122     }
123   }
124 
125   size_t buffer_index = 0;
126   for (auto _ : state) {
127     state.PauseTiming();
128     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
129     buffer_index = (buffer_index + 1) % num_buffers;
130     state.ResumeTiming();
131 
132     status = xnn_run_operator(convolution_operators[buffer_index],
133       nullptr /* thread pool */);
134     if (status != xnn_status_success) {
135       state.SkipWithError("failed to run QINT8 Convolution operator");
136       return;
137     }
138   }
139 
140   for (xnn_operator_t& convolution_op : convolution_operators) {
141     status = xnn_delete_operator(convolution_op);
142     if (status != xnn_status_success) {
143       state.SkipWithError("failed to delete QINT8 Convolution operator");
144       return;
145     }
146     convolution_op = nullptr;
147   }
148 
149   state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
150   state.counters["OPS"] = benchmark::Counter(
151     uint64_t(state.iterations()) * 2 *
152       batch_size * output_height * output_width *
153       groups * group_input_channels * group_output_channels *
154       kernel_height * kernel_width,
155     benchmark::Counter::kIsRate);
156 }
157 
xnnpack_convolution_f32(benchmark::State & state,const char * net)158 void xnnpack_convolution_f32(benchmark::State& state, const char* net) {
159   const size_t batch_size = state.range(0);
160   const size_t input_height = state.range(1);
161   const size_t input_width = state.range(2);
162   const size_t kernel_height = state.range(3);
163   const size_t kernel_width = state.range(4);
164   const size_t padding_height = state.range(5);
165   const size_t padding_width = state.range(6);
166   const size_t subsampling = state.range(7);
167   const size_t dilation = state.range(8);
168   const size_t groups = state.range(9);
169   const size_t group_input_channels = state.range(10);
170   const size_t group_output_channels = state.range(11);
171 
172   std::random_device random_device;
173   auto rng = std::mt19937(random_device());
174   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
175 
176   const size_t output_pixel_stride = groups * group_output_channels;
177   const size_t input_pixel_stride = groups * group_input_channels;
178   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
179   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
180   const size_t padding_left = padding_width / 2;
181   const size_t padding_top = padding_height / 2;
182   const size_t padding_right = padding_width - padding_left;
183   const size_t padding_bottom = padding_height - padding_top;
184   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
185   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
186 
187   std::vector<float> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
188   std::generate(input.begin(), input.end(), std::ref(f32rng));
189   std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
190   std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
191   std::vector<float> bias(groups * group_output_channels);
192   std::generate(bias.begin(), bias.end(), std::ref(f32rng));
193   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
194 
195   xnn_status status = xnn_initialize(nullptr /* allocator */);
196   if (status != xnn_status_success) {
197     state.SkipWithError("failed to initialize XNNPACK");
198     return;
199   }
200 
201   if (!cpuinfo_initialize()) {
202     state.SkipWithError("cpuinfo initialization failed");
203     return;
204   }
205   const size_t num_buffers = 1 +
206     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
207       sizeof(float) * (kernel.size() + bias.size() + output_elements));
208   std::vector<float> output(output_elements * num_buffers);
209 
210   std::vector<xnn_operator_t> convolution_operators(num_buffers);
211   for (xnn_operator_t& convolution_op : convolution_operators) {
212     status = xnn_create_convolution2d_nhwc_f32(
213       padding_top, padding_right, padding_bottom, padding_left,
214       kernel_height, kernel_width,
215       subsampling, subsampling,
216       dilation, dilation,
217       groups, group_input_channels, group_output_channels,
218       input_pixel_stride, output_pixel_stride,
219       kernel.data(), bias.data(),
220       -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
221       0 /* flags */, &convolution_op);
222     if (status != xnn_status_success) {
223       state.SkipWithError("failed to create FP32 Convolution operator");
224       return;
225     }
226   }
227 
228   for (size_t i = 0; i < convolution_operators.size(); i++) {
229     status = xnn_setup_convolution2d_nhwc_f32(
230       convolution_operators[i],
231       batch_size, input_height, input_width,
232       input.data(), output.data() + i * output_elements,
233       nullptr /* thread pool */);
234     if (status != xnn_status_success) {
235       state.SkipWithError("failed to setup FP32 Convolution operator");
236       return;
237     }
238   }
239 
240   size_t buffer_index = 0;
241   for (auto _ : state) {
242     state.PauseTiming();
243     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
244     buffer_index = (buffer_index + 1) % num_buffers;
245     state.ResumeTiming();
246 
247     status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
248     if (status != xnn_status_success) {
249       state.SkipWithError("failed to run FP32 Convolution operator");
250       return;
251     }
252   }
253 
254   for (xnn_operator_t& convolution_op : convolution_operators) {
255     status = xnn_delete_operator(convolution_op);
256     if (status != xnn_status_success) {
257       state.SkipWithError("failed to delete FP32 Convolution operator");
258       return;
259     }
260     convolution_op = nullptr;
261   }
262 
263   state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
264   state.counters["FLOPS"] = benchmark::Counter(
265     uint64_t(state.iterations()) * 2 *
266       batch_size * output_height * output_width *
267       groups * group_input_channels * group_output_channels *
268       kernel_height * kernel_width,
269     benchmark::Counter::kIsRate);
270 }
271 
272 #ifdef BENCHMARK_TENSORFLOW_LITE
// Benchmarks TensorFlow Lite's FP32 2D convolution for comparison against
// XNNPACK. Builds a single-operator TFLite model in memory with FlatBuffers
// (CONV_2D, or DEPTHWISE_CONV_2D when groups > 1 with one input channel per
// group), then times Interpreter::Invoke(). Benchmark parameters come from
// the state ranges in the order read below.
void tflite_convolution_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  // TFLite has no generic grouped convolution; only the depthwise special
  // case (one input channel per group) can be expressed.
  bool is_depthwise = false;
  if (groups != 1) {
    if (group_input_channels == 1) {
      is_depthwise = true;
    } else {
      state.SkipWithError("grouped convolution is not supported");
      return;
    }
  }

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);

  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;

  // TFLite only supports SAME/VALID padding: map the explicit total padding
  // onto one of those, or skip the benchmark if neither matches.
  tflite::Padding padding = tflite::Padding_VALID;
  if (padding_width == (effective_kernel_width - 1) && padding_height == (effective_kernel_height - 1)) {
    padding = tflite::Padding_SAME;
  } else if (padding_width == 0 && padding_height == 0) {
    padding = tflite::Padding_VALID;
  } else {
    state.SkipWithError("unsupported padding");
    return;
  }

  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  // Random filter and bias data; they become constant buffers in the model.
  std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
  std::vector<float> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(f32rng));

  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
      CreateOperatorCode(
        builder,
        is_depthwise ? tflite::BuiltinOperator_DEPTHWISE_CONV_2D : tflite::BuiltinOperator_CONV_2D,
        0);

  // Build options for both operator flavors; only the one matching
  // is_depthwise is referenced by the Operator below.
  flatbuffers::Offset<tflite::Conv2DOptions> conv2d_options = CreateConv2DOptions(
      builder,
      padding,
      static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
      tflite::ActivationFunctionType_NONE,
      static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));

  flatbuffers::Offset<tflite::DepthwiseConv2DOptions> dwconv2d_options = CreateDepthwiseConv2DOptions(
      builder,
      padding,
      static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
      static_cast<int32_t>(group_output_channels),
      tflite::ActivationFunctionType_NONE,
      static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));

  // Buffer table: 0 = empty (used by non-constant tensors), 1 = filter data,
  // 2 = bias data. Tensor definitions below refer to these by index.
  flatbuffers::Offset<tflite::Buffer> buffers[3] = {
    tflite::CreateBuffer(builder, builder.CreateVector({})),
    tflite::CreateBuffer(builder, builder.CreateVector(
      reinterpret_cast<const uint8_t*>(kernel.data()),
      sizeof(float) * kernel.size())),
    tflite::CreateBuffer(builder, builder.CreateVector(
      reinterpret_cast<const uint8_t*>(bias.data()),
      sizeof(float) * bias.size())),
  };

  // Shapes use TFLite's NHWC layout; filter shape is OHWI.
  const int32_t input_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(input_height),
    static_cast<int32_t>(input_width),
    static_cast<int32_t>(groups * group_input_channels)
  };
  const int32_t output_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(output_height),
    static_cast<int32_t>(output_width),
    static_cast<int32_t>(groups * group_output_channels)
  };
  const int32_t filter_shape[4] = {
    static_cast<int32_t>(group_output_channels),
    static_cast<int32_t>(kernel_height),
    static_cast<int32_t>(kernel_width),
    static_cast<int32_t>(groups * group_input_channels)
  };
  const int32_t bias_shape[1] = {
    static_cast<int32_t>(groups * group_output_channels)
  };

  // Tensor table: 0 = input, 1 = filter, 2 = bias, 3 = output. Input and
  // output reference the empty buffer (id 0) since they are not constants.
  flatbuffers::Offset<tflite::Tensor> tensors[4] = {
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(input_shape, 4),
                         tflite::TensorType_FLOAT32,
                         0 /* buffer id */,
                         builder.CreateString("input")),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(filter_shape, 4),
                         tflite::TensorType_FLOAT32,
                         1 /* buffer id */,
                         builder.CreateString("filter")),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(bias_shape, 1),
                         tflite::TensorType_FLOAT32,
                         2 /* buffer id */,
                         builder.CreateString("bias")),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(output_shape, 4),
                         tflite::TensorType_FLOAT32,
                         0 /* buffer id */,
                         builder.CreateString("output")),
  };

  // The single operator consumes tensors {input, filter, bias} and produces
  // tensor 3 (output).
  const int32_t op_inputs[3] = { 0, 1, 2 };
  const int32_t op_outputs[1] = { 3 };
  flatbuffers::Offset<tflite::Operator> op = CreateOperator(
      builder,
      0 /* opcode_index */,
      builder.CreateVector<int32_t>(op_inputs, 3),
      builder.CreateVector<int32_t>(op_outputs, 1),
      is_depthwise ? tflite::BuiltinOptions_DepthwiseConv2DOptions : tflite::BuiltinOptions_Conv2DOptions,
      is_depthwise ? dwconv2d_options.Union() : conv2d_options.Union(),
      /*custom_options */ 0,
      tflite::CustomOptionsFormat_FLEXBUFFERS);

  const int32_t graph_inputs[1] = { 0 };
  const int32_t graph_outputs[1] = { 3 };
  flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
      builder,
      builder.CreateVector(tensors, 4),
      builder.CreateVector<int32_t>(graph_inputs, 1),
      builder.CreateVector<int32_t>(graph_outputs, 1),
      builder.CreateVector(&op, 1),
      builder.CreateString("Conv2D subgraph"));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Conv2D model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
      TFLITE_SCHEMA_VERSION,
      builder.CreateVector(&operator_code, 1),
      builder.CreateVector(&subgraph, 1),
      description,
      builder.CreateVector(buffers, 3));

  builder.Finish(model_buffer);

  // Deserialize the model just built and instantiate a single-threaded
  // interpreter over it (single-threaded to match the XNNPACK benchmarks,
  // which pass a null thread pool).
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolver resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  if (interpreter == nullptr) {
    state.SkipWithError("TFLite interpreter is null");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the input tensor (tensor index 0) with random data once, up front.
  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size * groups * group_input_channels * input_height * input_width,
    std::ref(f32rng));

  for (auto _ : state) {
    // Untimed: flush caches, then re-warm only the input in L1, so every
    // iteration runs from an identical cache state.
    state.PauseTiming();
    benchmark::utils::WipeCache();
    benchmark::utils::PrefetchToL1(
      interpreter->typed_tensor<float>(0),
      batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));
    state.ResumeTiming();

    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  // 2 FLOPs (multiply + add) per MAC.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * output_height * output_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);

  interpreter.reset();
}
480 #endif  // BENCHMARK_TENSORFLOW_LITE
481 
482 #ifdef BENCHMARK_ARM_COMPUTE_LIBRARY
// Computes a naive reference NHWC convolution over the same inputs and
// compares it element-wise against `output`. Returns an empty string on
// success, or a human-readable description of the first mismatching element.
// Convolution parameters are re-read from the benchmark state ranges, so the
// state must be the same one used to size input/kernel/bias/output.
static std::string compare_with_convolution_f32_reference_output(
    const benchmark::State& state, const float* input, size_t input_size,
    const float* kernel, size_t kernel_size, const float* bias, size_t bias_size,
    const float* output, size_t output_size)
{
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t input_pixel_stride = groups * group_input_channels;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;

  // Sanity-check that the caller-provided sizes match the state-derived
  // geometry. Note: these are asserts, so they compile out under NDEBUG.
  assert(input_size == batch_size * input_height * input_width * groups * group_input_channels);

  assert(kernel_size == group_output_channels * kernel_height * kernel_width * groups * group_input_channels);

  assert(bias_size == groups * group_output_channels);

  assert(output_size == batch_size * output_height * output_width * groups * group_output_channels);

  // Pass 1: initialize every reference output element with its bias.
  std::vector<float> output_ref(output_size);
  for (size_t i = 0; i < batch_size; i++) {
    for (size_t oy = 0; oy < output_height; oy++) {
      for (size_t ox = 0; ox < output_width; ox++) {
        for (size_t g = 0; g < groups; g++) {
          for (size_t oc = 0; oc < group_output_channels; oc++) {
            output_ref[(((i * output_height + oy) * output_width + ox) * groups + g) * group_output_channels + oc] =
              bias[g * group_output_channels + oc];
          }
        }
      }
    }
  }
  // Pass 2: accumulate the convolution. iy/ix are size_t, so positions that
  // fall into the (logical) padding region underflow to huge values and are
  // rejected by the single `iy < input_height` / `ix < input_width` test,
  // which covers both the negative and the past-the-end cases.
  for (size_t i = 0; i < batch_size; i++) {
    for (size_t oy = 0; oy < output_height; oy++) {
      for (size_t ox = 0; ox < output_width; ox++) {
        for (size_t ky = 0; ky < kernel_height; ky++) {
          const size_t iy = oy * subsampling + ky * dilation - padding_top;
          if (iy < input_height) {
            for (size_t kx = 0; kx < kernel_width; kx++) {
              const size_t ix = ox * subsampling + kx * dilation - padding_left;
              if (ix < input_width) {
                for (size_t g = 0; g < groups; g++) {
                  for (size_t oc = 0; oc < group_output_channels; oc++) {
                    for (size_t ic = 0; ic < group_input_channels; ic++) {
                      output_ref[(((i * output_height + oy) * output_width + ox) * groups + g) * group_output_channels + oc] +=
                        input[((i * input_height + iy) * input_width + ix) * input_pixel_stride + g * group_input_channels + ic] *
                        kernel[(((oc * kernel_height + ky) * kernel_width + kx) * groups + g) * group_input_channels + ic];
                    }  // group_input_channels loop
                  }  // group_output_channels loop
                }  // groups loop
              }
            }  // kernel_width loop
          }
        }  // kernel_height loop
      }  // output_width loop
    }  // output_height loop
  }  // batch_size loop

  // Pass 3: compare with a relative tolerance (floored at machine epsilon so
  // near-zero reference values don't demand exact equality).
  const float relative_error_tolerance = 1e-4;
  for (size_t i = 0; i < batch_size; i++) {
    for (size_t y = 0; y < output_height; y++) {
      for (size_t x = 0; x < output_width; x++) {
        for (size_t g = 0; g < groups; g++) {
          for (size_t c = 0; c < group_output_channels; c++) {
            const size_t idx = (((i * output_height + y) * output_width + x) * groups + g) * group_output_channels + c;
            const float value_ref = output_ref[idx];
            const float value = output[idx];
            if (std::abs(value - value_ref) > std::max(std::abs(value_ref) * relative_error_tolerance, std::numeric_limits<float>::epsilon())) {
              std::ostringstream error_stream;
              error_stream << "(x, y) = (" << x << ", " << y << "), group = " << g
                       << ", channel = " << c << ", refValue = " << value_ref
                       << ", actualValue = " << value
                       << ", absDiff=" << std::abs(value - value_ref);
              return error_stream.str();
            }
          }
        }
      }
    }
  }
  return "";
}
580 
armcl_convolution_f32(benchmark::State & state,const char * net)581 void armcl_convolution_f32(benchmark::State& state, const char* net) {
582   const size_t batch_size = state.range(0);
583   const size_t input_height = state.range(1);
584   const size_t input_width = state.range(2);
585   const size_t kernel_height = state.range(3);
586   const size_t kernel_width = state.range(4);
587   const size_t padding_height = state.range(5);
588   const size_t padding_width = state.range(6);
589   const size_t subsampling = state.range(7);
590   const size_t dilation = state.range(8);
591   const size_t groups = state.range(9);
592   const size_t group_input_channels = state.range(10);
593   const size_t group_output_channels = state.range(11);
594 
595   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
596   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
597   const size_t padding_left = padding_width / 2;
598   const size_t padding_top = padding_height / 2;
599   const size_t padding_right = padding_width - padding_left;
600   const size_t padding_bottom = padding_height - padding_top;
601   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
602   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
603 
604   arm_compute::PadStrideInfo pad_stride_info(
605     subsampling /* stride height */,
606     subsampling /* stride width */,
607     padding_left, padding_right, padding_top, padding_bottom,
608     arm_compute::DimensionRoundingType::FLOOR);
609   arm_compute::Size2D dilation_info(dilation, dilation);
610   // Note: activation is disabled by default.
611   arm_compute::ActivationLayerInfo activation_info;
612 
613   // Note: no batch size and reverse order of dimensions, i.e. CWHN for NHWC.
614   arm_compute::TensorShape input_shape(
615     /* C */ groups * group_input_channels,
616     /* W */ input_width,
617     /* H */ input_height,
618     /* N */ batch_size);
619   arm_compute::TensorInfo input_info(
620     input_shape,
621     1 /* number of channels per element (!) */,
622     arm_compute::DataType::F32);
623   input_info.set_data_layout(arm_compute::DataLayout::NHWC);
624   arm_compute::Tensor input_tensor;
625   input_tensor.allocator()->init(input_info);
626   input_tensor.allocator()->allocate();
627 
628   // Note: reverse order of dimensions, i.e. for IWHO for OHWI.
629   arm_compute::TensorShape kernel_shape(
630     /* I */ groups * group_input_channels,
631     /* W */ kernel_width,
632     /* H */ kernel_height,
633     /* O */ group_output_channels);
634   arm_compute::TensorInfo kernel_info(
635     kernel_shape,
636     1 /* number of channels per element (!) */,
637     arm_compute::DataType::F32);
638   kernel_info.set_data_layout(arm_compute::DataLayout::NHWC);
639   arm_compute::Tensor kernelTensor;
640   kernelTensor.allocator()->init(kernel_info);
641   kernelTensor.allocator()->allocate();
642 
643   arm_compute::TensorShape bias_shape(groups * group_output_channels);
644   arm_compute::TensorInfo bias_info(
645     bias_shape,
646     1 /* number of channels per element (!) */,
647     arm_compute::DataType::F32);
648   bias_info.set_data_layout(arm_compute::DataLayout::NHWC);
649   arm_compute::Tensor bias_tensor;
650   bias_tensor.allocator()->init(bias_info);
651   bias_tensor.allocator()->allocate();
652 
653   // Note: no batch size and reverse order of dimensions, i.e. CWHN for NHWC.
654   arm_compute::TensorShape output_shape(
655     /* C */ groups * group_output_channels,
656     /* W */ output_width,
657     /* H */ output_height,
658     /* N */ batch_size);
659   arm_compute::TensorInfo output_info(
660     output_shape,
661     1 /* number of channels per element (!) */,
662     arm_compute::DataType::F32);
663   output_info.set_data_layout(arm_compute::DataLayout::NHWC);
664   arm_compute::Tensor output_tensor;
665   output_tensor.allocator()->init(output_info);
666   output_tensor.allocator()->allocate();
667 
668   std::random_device random_device;
669   auto rng = std::mt19937(random_device());
670   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
671 
672   std::generate(
673     reinterpret_cast<float*>(input_tensor.buffer()),
674     reinterpret_cast<float*>(input_tensor.buffer()) + input_shape.total_size(),
675     std::ref(f32rng));
676   std::generate(
677     reinterpret_cast<float*>(kernelTensor.buffer()),
678     reinterpret_cast<float*>(kernelTensor.buffer()) + kernel_shape.total_size(),
679     std::ref(f32rng));
680   std::generate(
681     reinterpret_cast<float*>(bias_tensor.buffer()),
682     reinterpret_cast<float*>(bias_tensor.buffer()) + bias_shape.total_size(),
683     std::ref(f32rng));
684   std::generate(
685     reinterpret_cast<float*>(output_tensor.buffer()),
686     reinterpret_cast<float*>(output_tensor.buffer()) + output_shape.total_size(),
687     std::ref(f32rng));
688 
689   bool is_depthwise = false;
690   if (groups != 1) {
691     // NEConvolutionLayer uses NEGEMMConvolutionLayer by default, which doesn't support grouped convolution.
692     // However, depthwise convolution is supported via NEDepthwiseConvolutionLayer.
693     if (group_input_channels == 1) {
694       is_depthwise = true;
695     } else {
696       state.SkipWithError("grouped convolution is not supported");
697       return;
698     }
699   }
700 
701   std::shared_ptr<arm_compute::IFunction> layer;
702   if (is_depthwise) {
703     if (dilation != 1) {
704       state.SkipWithError("dilated depthwise convolution is not supported");
705       return;
706     }
707 
708     // Avoid NEDepthwiseConvolutionLayer3x3 when stride isn't 2 in order to pass the output verification.
709     // TODO(b/130206370) This looks like a bug and needs further investigation.
710     if (kernel_height == 3 && kernel_width == 3 && subsampling == 2) {
711       auto* depthwise_3x3_convolution_layer = new arm_compute::NEDepthwiseConvolutionLayer3x3();
712       layer.reset(depthwise_3x3_convolution_layer);
713       depthwise_3x3_convolution_layer->configure(
714         &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
715         pad_stride_info, group_output_channels, activation_info);
716 
717       if (!depthwise_3x3_convolution_layer->validate(
718         &input_info, &kernel_info, &bias_info, &output_info,
719         pad_stride_info, group_output_channels, activation_info))
720       {
721         state.SkipWithError("validation failed");
722         return;
723       }
724     } else {
725       auto* depthwise_convolution_layer = new arm_compute::NEDepthwiseConvolutionLayer();
726       layer.reset(depthwise_convolution_layer);
727       depthwise_convolution_layer->configure(
728         &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
729         pad_stride_info, group_output_channels, activation_info);
730 
731       if (!depthwise_convolution_layer->validate(
732         &input_info, &kernel_info, &bias_info, &output_info,
733         pad_stride_info, group_output_channels, activation_info))
734       {
735         state.SkipWithError("validation failed");
736         return;
737       }
738     }
739   } else {
740     auto* convolution_layer = new arm_compute::NEConvolutionLayer();
741     layer.reset(convolution_layer);
742     convolution_layer->configure(
743       &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
744       pad_stride_info, arm_compute::WeightsInfo(), dilation_info, activation_info,
745       true /* enable fast math */, groups);
746 
747     if (!convolution_layer->validate(
748       &input_info, &kernel_info, &bias_info, &output_info,
749       pad_stride_info, arm_compute::WeightsInfo(), dilation_info, activation_info,
750       true /* enable fast math */, groups))
751     {
752       state.SkipWithError("validation failed");
753       return;
754     }
755   }
756 
757   // Dry run to let ACL do one-time initializations.
758   arm_compute::CPPScheduler::get().set_num_threads(1);
759   layer->run();
760 
761   for (auto _ : state) {
762     state.PauseTiming();
763     benchmark::utils::WipeCache();
764     benchmark::utils::PrefetchToL1(
765       input_tensor.buffer(),
766       batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));
767     state.ResumeTiming();
768 
769     layer->run();
770   }
771 
772   // Validate outputs.
773   const std::string error_string = compare_with_convolution_f32_reference_output(
774       state, reinterpret_cast<const float*>(input_tensor.buffer()),
775       input_shape.total_size(),
776       reinterpret_cast<const float*>(kernelTensor.buffer()),
777       kernel_shape.total_size(),
778       reinterpret_cast<const float*>(bias_tensor.buffer()),
779       bias_shape.total_size(),
780       reinterpret_cast<const float*>(output_tensor.buffer()),
781       output_shape.total_size());
782 
783   if (!error_string.empty()) {
784     state.SkipWithError(("validation failed: " + error_string).c_str());
785     return;
786   }
787 
788   input_tensor.allocator()->free();
789   kernelTensor.allocator()->free();
790   bias_tensor.allocator()->free();
791   output_tensor.allocator()->free();
792 
793   state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
794   state.counters["FLOPS"] = benchmark::Counter(
795     uint64_t(state.iterations()) * 2 *
796       batch_size * output_height * output_width *
797       groups * group_input_channels * group_output_channels *
798       kernel_height * kernel_width,
799     benchmark::Counter::kIsRate);
800 }
801 #endif  // BENCHMARK_ARM_COMPUTE_LIBRARY
802 
803 // ShuffleNet v1 with 1 group.
static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
  // Benchmark parameters, in the order consumed via state.range(): batch (N),
  // input H/W, kernel H/W, padding H/W, stride (S), dilation (D), groups (G),
  // and input/output channels per group (GCin/GCout).  Rows with G > 1 and
  // GCin == GCout == 1 are depthwise 3x3 layers.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*************************** Conv 1 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
  /******************* Stage 2: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   36});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  36,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   36,  120});
  /******************* Stage 2: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  // NOTE(review): the depthwise rows in the "stride-1 units" sections below
  // use S == 2 even though the following 1x1 rows keep the same spatial size,
  // which is only consistent with S == 1 -- confirm against the ShuffleNet v1
  // architecture before relying on these shapes.
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  144,   36});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  36,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   36,  144});
  /******************* Stage 3: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  144,   72});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  72,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   72,  144});
  /******************* Stage 3: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  288,   72});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1,  72,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   72,  288});
  /******************* Stage 4: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  288,  144});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 144,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  144,  288});
  /******************* Stage 4: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,  144});
  b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 144,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  144,  576});
}
841 
842 // ShuffleNet v1 with 2 groups.
static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
  // Benchmark parameters, in the order consumed via state.range(): batch (N),
  // input H/W, kernel H/W, padding H/W, stride (S), dilation (D), groups (G),
  // and input/output channels per group (GCin/GCout).  Rows with G > 1 and
  // GCin == GCout == 1 are depthwise 3x3 layers.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*************************** Conv 1 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
  /******************* Stage 2: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   50});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  50,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,   25,   88});
  /******************* Stage 2: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  // NOTE(review): the depthwise rows in the "stride-1 units" sections below
  // use S == 2 even though the following 1x1 rows keep the same spatial size,
  // which is only consistent with S == 1 -- confirm against the ShuffleNet v1
  // architecture before relying on these shapes.
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,  100,   25});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  50,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,   25,  100});
  /******************* Stage 3: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,  100,   50});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 100,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,   50,  100});
  /******************* Stage 3: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,  200,   50});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 100,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,   50,  200});
  /******************* Stage 4: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,  200,  100});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 200,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   2,  100,  200});
  /******************* Stage 4: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   2,  400,  100});
  b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 200,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   2,  100,  400});
}
880 
881 // ShuffleNet v1 with 3 groups.
static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
  // Benchmark parameters, in the order consumed via state.range(): batch (N),
  // input H/W, kernel H/W, padding H/W, stride (S), dilation (D), groups (G),
  // and input/output channels per group (GCin/GCout).  Rows with G > 1 and
  // GCin == GCout == 1 are depthwise 3x3 layers.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*************************** Conv 1 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
  /******************* Stage 2: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   60});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  60,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   20,   72});
  /******************* Stage 2: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  // NOTE(review): the depthwise rows in the "stride-1 units" sections below
  // use S == 2 even though the following 1x1 rows keep the same spatial size,
  // which is only consistent with S == 1 -- confirm against the ShuffleNet v1
  // architecture before relying on these shapes.
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   80,   20});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  60,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   20,   80});
  /******************* Stage 3: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   80,   40});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 120,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,   40,   80});
  /******************* Stage 3: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,  160,   40});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 120,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,   40,  160});
  /******************* Stage 4: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,  160,   80});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 240,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   3,   80,  160});
  /******************* Stage 4: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   3,  320,   80});
  b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 240,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   3,   80,  320});
}
919 
920 // ShuffleNet v1 with 4 groups.
static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
  // Benchmark parameters, in the order consumed via state.range(): batch (N),
  // input H/W, kernel H/W, padding H/W, stride (S), dilation (D), groups (G),
  // and input/output channels per group (GCin/GCout).  Rows with G > 1 and
  // GCin == GCout == 1 are depthwise 3x3 layers.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*************************** Conv 1 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
  /******************* Stage 2: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   68});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  68,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   17,   62});
  /******************* Stage 2: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  // NOTE(review): the depthwise rows in the "stride-1 units" sections below
  // use S == 2 even though the following 1x1 rows keep the same spatial size,
  // which is only consistent with S == 1 -- confirm against the ShuffleNet v1
  // architecture before relying on these shapes.
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   68,   17});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  68,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   17,   68});
  /******************* Stage 3: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   68,   34});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 136,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,   34,   68});
  /******************* Stage 3: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,  136,   34});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 136,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,   34,  136});
  /******************* Stage 4: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,  136,   68});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 272,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   4,   68,  136});
  /******************* Stage 4: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   4,  272,   68});
  b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 272,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   4,   68,  272});
}
958 
959 // ShuffleNet v1 with 8 groups.
static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
  // Benchmark parameters, in the order consumed via state.range(): batch (N),
  // input H/W, kernel H/W, padding H/W, stride (S), dilation (D), groups (G),
  // and input/output channels per group (GCin/GCout).  Rows with G > 1 and
  // GCin == GCout == 1 are depthwise 3x3 layers.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*************************** Conv 1 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
  /******************* Stage 2: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   96});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  96,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   12,   45});
  /******************* Stage 2: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  // NOTE(review): the depthwise rows in the "stride-1 units" sections below
  // use S == 2 even though the following 1x1 rows keep the same spatial size,
  // which is only consistent with S == 1 -- confirm against the ShuffleNet v1
  // architecture before relying on these shapes.
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   48,   12});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  96,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   12,   48});
  /******************* Stage 3: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   48,   24});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 192,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   24,   48});
  /******************* Stage 3: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   96,   24});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 192,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   24,   96});
  /******************* Stage 4: stride-2 unit ******************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   96,   48});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 384,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   8,   48,   96});
  /******************* Stage 4: stride-1 units *****************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   8,  192,   48});
  b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 384,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   8,   48,  192});
}
997 
998 // ShuffleNet v2 (0.5X scale)
ShuffleNetV2X05(benchmark::internal::Benchmark * b)999 static void ShuffleNetV2X05(benchmark::internal::Benchmark* b) {
1000   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1001 
1002   /*************************** Conv 1 **************************/
1003   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1004   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
1005   /************************** Stage 2 **************************/
1006   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1007   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
1008   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   24});
1009   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   24});
1010   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  24,    1,    1});
1011   /************************** Stage 3 **************************/
1012   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1013   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  48,    1,    1});
1014   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   48,   48});
1015   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   48,   48});
1016   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1,  48,    1,    1});
1017   /************************** Stage 4 **************************/
1018   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1019   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1,  96,    1,    1});
1020   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,   96});
1021   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,   96});
1022   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1,  96,    1,    1});
1023   /*************************** Conv 5 **************************/
1024   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1025   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  192, 1024});
1026 }
1027 
1028 // ShuffleNet v2 (1.0X scale)
static void ShuffleNetV2X10(benchmark::internal::Benchmark* b) {
  // Benchmark parameters, in the order consumed via state.range(): batch (N),
  // input H/W, kernel H/W, padding H/W, stride (S), dilation (D), groups (G),
  // and input/output channels per group (GCin/GCout).  Rows with G > 1 and
  // GCin == GCout == 1 are depthwise 3x3 layers.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*************************** Conv 1 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
  /************************** Stage 2 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   58});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   58});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  58,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   58,   58});
  b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  58,    1,    1});
  /************************** Stage 3 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 116,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  116,  116});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  116,  116});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 116,    1,    1});
  /************************** Stage 4 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 232,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  232,  232});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  232,  232});
  b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 232,    1,    1});
  /*************************** Conv 5 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  464, 1024});
}
1059 
1060 // ShuffleNet v2 (1.5X scale)
static void ShuffleNetV2X15(benchmark::internal::Benchmark* b) {
  // Benchmark parameters, in the order consumed via state.range(): batch (N),
  // input H/W, kernel H/W, padding H/W, stride (S), dilation (D), groups (G),
  // and input/output channels per group (GCin/GCout).  Rows with G > 1 and
  // GCin == GCout == 1 are depthwise 3x3 layers.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*************************** Conv 1 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
  /************************** Stage 2 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   88});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   88});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  88,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   88,   88});
  b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  88,    1,    1});
  /************************** Stage 3 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 176,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  176,  176});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  176,  176});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 176,    1,    1});
  /************************** Stage 4 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 352,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  352,  352});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  352,  352});
  b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 352,    1,    1});
  /*************************** Conv 5 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  704, 1024});
}
1091 
1092 // ShuffleNet v2 (2.0X scale)
static void ShuffleNetV2X20(benchmark::internal::Benchmark* b) {
  // Benchmark parameters, in the order consumed via state.range(): batch (N),
  // input H/W, kernel H/W, padding H/W, stride (S), dilation (D), groups (G),
  // and input/output channels per group (GCin/GCout).  Rows with G > 1 and
  // GCin == GCout == 1 are depthwise 3x3 layers.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*************************** Conv 1 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
  /************************** Stage 2 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,  122});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,  122});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 122,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  122,  122});
  b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 122,    1,    1});
  /************************** Stage 3 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 244,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  244,  244});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  244,  244});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 244,    1,    1});
  /************************** Stage 4 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 488,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  488,  488});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  488,  488});
  b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 488,    1,    1});
  /*************************** Conv 5 **************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  976, 2048});
}
1123 
MobileNetV1(benchmark::internal::Benchmark * b)1124 static void MobileNetV1(benchmark::internal::Benchmark* b) {
1125   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1126 
1127   /*       N   H    W   KH  KW  PH  PW  S  D    G   GCin  GCout */
1128   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,    1,    3,   32});
1129   b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1,   32,    1,    1});
1130   b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,    1,   32,   64});
1131   b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,   64,    1,    1});
1132   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,    1,   64,  128});
1133   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1,  128,    1,    1});
1134   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,    1,  128,  128});
1135   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  128,    1,    1});
1136   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,    1,  128,  256});
1137   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  256,    1,    1});
1138   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,    1,  256,  256});
1139   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  256,    1,    1});
1140   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,    1,  256,  512});
1141   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1,  512,    1,    1});
1142   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,    1,  512,  512});
1143   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1,  512,    1,    1});
1144   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,    1,  512, 1024});
1145   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 1024,    1,    1});
1146   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,    1, 1024, 1024});
1147 }
1148 
static void MobileNetV2(benchmark::internal::Benchmark* b) {
  // Benchmark parameters, in the order consumed via state.range(): batch (N),
  // input H/W, kernel H/W, padding H/W, stride (S), dilation (D), groups (G),
  // and input/output channels per group (GCin/GCout).  Rows with G > 1 and
  // GCin == GCout == 1 are depthwise 3x3 layers.
  // The //-disabled rows repeat shapes that already appear earlier in the
  // table -- presumably kept commented out to avoid measuring the same
  // convolution shape more than once.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   32});

  /************************ Bottleneck 1 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1,  32,    1,    1});
  b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   32,   16});

  /************************ Bottleneck 2 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   16,   96});
  b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,  96,    1,    1});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   96,   24});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,  144});
  b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 144,    1,    1});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,  144,   24});

  /************************ Bottleneck 3 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
//b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,  144});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 144,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  144,   32});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   32,  192});
  b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 192,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  192,   32});
//b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   32,  192});
//b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 192,    1,    1});
//b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  192,   32});

  /************************ Bottleneck 4 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
//b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   32,  192});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 192,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  192,   64});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   64});
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
//b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   64});
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
//b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   64});

  /************************ Bottleneck 5 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
//b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   96});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,  576});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 576,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  576,   96});
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,  576});
//b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 576,    1,    1});
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  576,   96});

  /************************ Bottleneck 6 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,  576});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 576,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,  160});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
  b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 960,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
//b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 960,    1,    1});
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});

  /************************ Bottleneck 7 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
//b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 960,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  320});

  /******************** Pre-pooling Conv2D *********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  320, 1280});
  /******************** Post-pooling Conv2D ********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1, 1280, 1000});
}
1233 
// Registers one benchmark case for every distinct Conv2D shape in
// MobileNet v3 Small. Argument order matches ArgNames: batch N, input
// height/width, kernel height/width, padding height/width, stride,
// dilation, groups, and per-group input/output channel counts.
// Depthwise layers appear as G > 1 with GCin = GCout = 1; the 1x1 convs
// at 1x1 spatial resolution presumably come from squeeze-excitation /
// classifier-head layers — TODO confirm against the model definition.
// Commented-out rows are shapes identical to an earlier registered row,
// kept for cross-reference against the full network.
static void MobileNetV3Small(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*********************** Initial Stage ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   16});
  /*********************** Bottleneck 1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,  16,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   16,    8});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,    8,   16});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   16,   16});
  /*********************** Bottleneck 2 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   16,   72});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  72,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   72,   24});
  /*********************** Bottleneck 3 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   88});
  b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  88,    1,    1});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   88,   24});
  /*********************** Bottleneck 4 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   96});
  b->Args({1,  28,  28,  5,  5,  4,  4, 2, 1,  96,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   96,   24});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   24,   96});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,   40});
  /*********************** Bottleneck 5 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   40,  240});
  b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 240,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,   64});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   64,  240});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  240,   40});
  /*********************** Bottleneck 6 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   40,  240});
//b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 240,    1,    1});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,   64});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   64,  240});
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  240,   40});
  /*********************** Bottleneck 7 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   40,  120});
  b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 120,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,   32});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   32,  120});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  120,   48});
  /*********************** Bottleneck 8 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   48,  144});
  b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 144,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  144,   40});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   40,  144});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  144,   48});
  /*********************** Bottleneck 9 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   48,  288});
  b->Args({1,  14,  14,  5,  5,  4,  4, 2, 1, 288,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  288,   72});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   72,  288});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  288,   96});
  /*********************** Bottleneck 10 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,  576});
  b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 576,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  576,  144});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  144,  576});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,   96});
  /*********************** Bottleneck 11 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,  576});
//b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 576,    1,    1});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  576,  144});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  144,  576});
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,   96});
  /************************ Last Stage  ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,  576});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  576, 1024});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1, 1024, 1001});
}
1318 
// Registers one benchmark case for every distinct Conv2D shape in
// MobileNet v3 Large. Argument order matches ArgNames: batch N, input
// height/width, kernel height/width, padding height/width, stride,
// dilation, groups, and per-group input/output channel counts.
// Depthwise layers appear as G > 1 with GCin = GCout = 1. Rows commented
// out duplicate an earlier registered shape and are kept so the listing
// still mirrors the full network, bottleneck by bottleneck.
static void MobileNetV3Large(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*********************** Initial Stage ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   16});
  /*********************** Bottleneck 1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1,  16,    1,    1});
  b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   16,   16});
  /*********************** Bottleneck 2 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   16,   64});
  b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,  64,    1,    1});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   64,   24});
  /*********************** Bottleneck 3 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   72});
  b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1,  72,    1,    1});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   72,   24});
  /*********************** Bottleneck 4 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   72});
  b->Args({1,  56,  56,  5,  5,  4,  4, 2, 1,  72,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   72,   24});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   24,   72});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   72,   40});
  /*********************** Bottleneck 5 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   40,  120});
  b->Args({1,  28,  28,  5,  5,  4,  4, 1, 1, 120,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,   32});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   32,  120});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  120,   40});
  /*********************** Bottleneck 6 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   40,  120});
//b->Args({1,  28,  28,  5,  5,  4,  4, 1, 1, 120,    1,    1});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,   32});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   32,  120});
//b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  120,   40});
  /*********************** Bottleneck 7 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   40,  240});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 240,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  240,   80});
  /*********************** Bottleneck 8 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  200});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 200,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  200,   80});
  /*********************** Bottleneck 9 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  184});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 184,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  184,   80});
  /********************** Bottleneck 10 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  184});
//b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 184,    1,    1});
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  184,   80});
  /********************** Bottleneck 11 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  480});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 480,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  480,  120});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,  480});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  480,  112});
  /********************** Bottleneck 12 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  112,  672});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 672,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  672,  168});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  168,  672});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  672,  112});
  /********************** Bottleneck 13 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  112,  672});
  b->Args({1,  14,  14,  5,  5,  4,  4, 2, 1, 672,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  672,  160});
  /********************** Bottleneck 14 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
  b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 960,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  960,  240});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,  960});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});
  /********************** Bottleneck 15 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
//b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 960,    1,    1});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  960,  240});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,  960});
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});
  /************************ Last Stage  ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  960, 1280});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1, 1280, 1001});
}
1419 
1420 // SqueezeNet 1.0
SqueezeNetV10(benchmark::internal::Benchmark * b)1421 static void SqueezeNetV10(benchmark::internal::Benchmark* b) {
1422   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1423 
1424   /************************** Conv 1 *************************/
1425   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1426   b->Args({1, 224, 224,  7,  7,  6,  6, 2, 1, 1,    3,   96});
1427   /************************** Fire 2 *************************/
1428   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1429   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   96,   16});
1430   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
1431   b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
1432   /************************** Fire 3 *************************/
1433   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1434   b->Args({1,  56,  55,  1,  1,  0,  0, 1, 1, 1,  128,   16});
1435 //b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
1436 //b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
1437   /************************** Fire 4 *************************/
1438   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1439   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,  128,   32});
1440   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   32,  128});
1441   b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   32,  128});
1442   /************************** Fire 5 *************************/
1443   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1444   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  256,   32});
1445   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   32,  128});
1446   b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   32,  128});
1447   /************************** Fire 6 *************************/
1448   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1449   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  256,   48});
1450   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   48,  192});
1451   b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   48,  192});
1452   /************************** Fire 7 *************************/
1453   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1454   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  384,   48});
1455 //b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   48,  192});
1456 //b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   48,  192});
1457   /************************** Fire 8 *************************/
1458   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1459   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  384,   64});
1460   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   64,  256});
1461   b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   64,  256});
1462   /************************** Fire 9 *************************/
1463   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1464   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512,   64});
1465   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   64,  256});
1466   b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   64,  256});
1467   /************************* Conv 10 *************************/
1468   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1469   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512, 1000});
1470 }
1471 
1472 // SqueezeNet 1.1
// Registers one benchmark case for every distinct Conv2D shape in
// SqueezeNet 1.1 (the revised, cheaper variant with a 3x3 stem conv and
// earlier pooling). Argument order matches ArgNames: batch N, input
// height/width, kernel height/width, padding height/width, stride,
// dilation, groups, and per-group input/output channel counts.
// Commented-out rows duplicate an earlier registered shape.
static void SqueezeNetV11(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************** Conv 1 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1, 1,    3,   64});
  /************************** Fire 2 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   64,   16});
  b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
  b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
  /************************** Fire 3 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,  128,   16});
//b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
//b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
  /************************** Fire 4 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  128,   32});
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   32,  128});
  b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   32,  128});
  /************************** Fire 5 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  256,   32});
//b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   32,  128});
//b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   32,  128});
  /************************** Fire 6 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  256,   48});
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   48,  192});
  b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   48,  192});
  /************************** Fire 7 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  384,   48});
//b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   48,  192});
//b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   48,  192});
  /************************** Fire 8 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  384,   64});
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   64,  256});
  b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   64,  256});
  /************************** Fire 9 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512,   64});
//b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   64,  256});
//b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   64,  256});
  /************************* Conv 10 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512, 1000});
}
1523 
// Registers one benchmark case for every distinct Conv2D shape in
// Inception v3, listed roughly stem-to-head. Argument order matches
// ArgNames: batch N, input height/width, kernel height/width, padding
// height/width, stride, dilation, groups, and per-group input/output
// channel counts. Rows with asymmetric kernels (1x7, 7x1, 1x3, 3x1) are
// the factorized convolutions characteristic of this architecture, and
// padding is asymmetric to match (e.g. KW=7 pairs with PW=6, KH=7 with
// PH=6).
static void InceptionV3(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1, 299, 299,  3,  3,  0,  0, 2, 1, 1,    3,   32});
  b->Args({1, 149, 149,  3,  3,  0,  0, 1, 1, 1,   32,   32});
  b->Args({1, 147, 147,  3,  3,  2,  2, 1, 1, 1,   32,   64});
  b->Args({1,  73,  73,  1,  1,  0,  0, 1, 1, 1,   64,   80});
  b->Args({1,  73,  73,  3,  3,  0,  0, 1, 1, 1,   80,  192});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  192,   64});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  192,   48});
  b->Args({1,  35,  35,  5,  5,  4,  4, 1, 1, 1,   48,   64});
  b->Args({1,  35,  35,  3,  3,  2,  2, 1, 1, 1,   64,   96});
  b->Args({1,  35,  35,  3,  3,  2,  2, 1, 1, 1,   96,   96});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  192,   32});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  256,   64});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  256,   48});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  288,   64});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  288,   48});
  b->Args({1,  35,  35,  3,  3,  0,  0, 2, 1, 1,  288,  384});
  b->Args({1,  35,  35,  3,  3,  0,  0, 2, 1, 1,   96,   96});
  b->Args({1,  17,  17,  1,  1,  0,  0, 1, 1, 1,  768,  192});
  b->Args({1,  17,  17,  1,  1,  0,  0, 1, 1, 1,  768,  128});
  b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  128,  128});
  b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  128,  192});
  b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  128,  128});
  b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  128,  192});
  b->Args({1,  17,  17,  1,  1,  0,  0, 1, 1, 1,  768,  160});
  b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  160,  160});
  b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  160,  192});
  b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  160,  160});
  b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  160,  192});
  b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  192,  192});
  b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  192,  192});
  b->Args({1,  17,  17,  3,  3,  0,  0, 2, 1, 1,  192,  320});
  b->Args({1,  17,  17,  3,  3,  0,  0, 2, 1, 1,  192,  192});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  320});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  384});
  b->Args({1,   8,   8,  1,  3,  0,  2, 1, 1, 1,  384,  384});
  b->Args({1,   8,   8,  3,  1,  2,  0, 1, 1, 1,  384,  384});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  448});
  b->Args({1,   8,   8,  3,  3,  2,  2, 1, 1, 1,  448,  384});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  192});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  320});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  384});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  448});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  192});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1, 1, 2048, 1001});
}
1573 
ResNet18(benchmark::internal::Benchmark * b)1574 static void ResNet18(benchmark::internal::Benchmark* b) {
1575   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1576 
1577   /************************* Conv 1 *************************/
1578   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1579   b->Args({1, 224, 224,  7,  7,  6,  6, 2, 1, 1,    3,   64});
1580   /************************ Conv 2.X ************************/
1581   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1582   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,   64,   64});
1583   /************************ Conv 3.X ************************/
1584   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1585   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 1,   64,  128});
1586   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  128,  128});
1587   b->Args({1,  56,  56,  1,  1,  0,  0, 2, 1, 1,   64,  128});
1588   /************************ Conv 4.X ************************/
1589   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1590   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 1,  128,  256});
1591   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 1,  256,  256});
1592   b->Args({1,  28,  28,  1,  1,  0,  0, 2, 1, 1,  128,  256});
1593   /************************ Conv 5.X ************************/
1594   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1595   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 1,  256,  512});
1596   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 1,  512,  512});
1597   b->Args({1,  14,  14,  1,  1,  0,  0, 2, 1, 1,  256,  512});
1598 }
1599 
// Registers one benchmark case for every distinct Conv2D shape in
// ResNet-50. Argument order matches ArgNames: batch N, input
// height/width, kernel height/width, padding height/width, stride,
// dilation, groups, and per-group input/output channel counts. Each
// ".1" stage includes the strided 1x1 projection shortcut; ".X" stages
// cover the repeated bottleneck blocks. Commented-out rows duplicate an
// earlier registered shape.
static void ResNet50(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************* Conv 1 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1, 224, 224,  7,  7,  6,  6, 2, 1, 1,    3,   64});
  /************************ Conv 2.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,   64});
  b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,   64,   64});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,  256});
//b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,  256});
  /************************ Conv 2.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,  256,   64});
//b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,   64,   64});
//b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,  256});
  /************************ Conv 3.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,  256,  128});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 1,  128,  128});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  128,  512});
  b->Args({1,  56,  56,  1,  1,  0,  0, 2, 1, 1,  256,  512});
  /************************ Conv 3.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  512,  128});
  b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  128,  128});
//b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  128,  512});
  /************************ Conv 4.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  512,  256});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 1,  256,  256});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1,  256, 1024});
  b->Args({1,  28,  28,  1,  1,  0,  0, 2, 1, 1,  512, 1024});
  /************************ Conv 4.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1, 1024,  256});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 1,  256,  256});
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1,  256, 1024});
  /************************ Conv 5.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1, 1024,  512});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 1,  512,  512});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1, 1,  512, 2048});
  b->Args({1,  14,  14,  1,  1,  0,  0, 2, 1, 1, 1024, 2048});
  /************************ Conv 5.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1, 1, 2048,  512});
  b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 1,  512,  512});
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1, 1,  512, 2048});
}
1651 
VGG(benchmark::internal::Benchmark * b)1652 static void VGG(benchmark::internal::Benchmark* b) {
1653   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1654 
1655   /************************* Conv 1.1 ************************/
1656   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1657   b->Args({1, 224, 224,  3,  3,  2,  2, 1, 1, 1,    3,   64});
1658   /************************* Conv 1.2 ************************/
1659   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1660   b->Args({1, 224, 224,  3,  3,  2,  2, 1, 1, 1,   64,   64});
1661 
1662   /************************* Conv 2.1 ************************/
1663   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1664   b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1, 1,   64,  128});
1665   /************************* Conv 2.2 ************************/
1666   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1667   b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1, 1,  128,  128});
1668 
1669   /************************* Conv 3.1 ************************/
1670   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1671   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,  128,  256});
1672   /************************* Conv 3.2 ************************/
1673   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1674   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,  256,  256});
1675   /************************* Conv 3.3 ************************/
1676   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1677   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,  256,  256});
1678 
1679   /************************* Conv 4.1 ************************/
1680   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1681   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  256,  512});
1682   /************************* Conv 4.2 ************************/
1683   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1684   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  512,  512});
1685   /************************* Conv 4.3 ************************/
1686   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1687   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  512,  512});
1688 
1689   /************************* Conv 5.X ************************/
1690   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1691   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 1,  512,  512});
1692   /************************* Conv 5.3 ************************/
1693   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1694   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1,  512,  512});
1695 }
1696 
1697 // SRCNN (9-1-5)
SRCNN915(benchmark::internal::Benchmark * b)1698 static void SRCNN915(benchmark::internal::Benchmark* b) {
1699   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1700 
1701   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1702   b->Args({1, 384, 384,  9,  9,  0,  0, 1, 1, 1,    1,   64});
1703   b->Args({1, 376, 376,  1,  1,  0,  0, 1, 1, 1,   64,   32});
1704   b->Args({1, 376, 376,  5,  5,  0,  0, 1, 1, 1,   32,    1});
1705 }
1706 
1707 // SRCNN (9-3-5)
SRCNN935(benchmark::internal::Benchmark * b)1708 static void SRCNN935(benchmark::internal::Benchmark* b) {
1709   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1710 
1711   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1712   b->Args({1, 384, 384,  9,  9,  0,  0, 1, 1, 1,    1,   64});
1713   b->Args({1, 376, 376,  3,  3,  0,  0, 1, 1, 1,   64,   32});
1714   b->Args({1, 374, 374,  5,  5,  0,  0, 1, 1, 1,   32,    1});
1715 }
1716 
1717 // SRCNN (9-5-5)
SRCNN955(benchmark::internal::Benchmark * b)1718 static void SRCNN955(benchmark::internal::Benchmark* b) {
1719   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1720 
1721   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1722   b->Args({1, 384, 384,  9,  9,  0,  0, 1, 1, 1,    1,   64});
1723   b->Args({1, 376, 376,  5,  5,  0,  0, 1, 1, 1,   64,   32});
1724   b->Args({1, 372, 372,  5,  5,  0,  0, 1, 1, 1,   32,    1});
1725 }
1726 
// Register the XNNPACK single-precision (f32) convolution benchmark against
// the per-network argument generators defined above. UseRealTime() reports
// wall-clock time, which accounts for XNNPACK's internal thread pool.
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
1749 
// Register the XNNPACK quantized 8-bit (q8) convolution benchmark for the
// same set of networks as the f32 variant above, so the two precisions can
// be compared per-network.
BENCHMARK_CAPTURE(xnnpack_convolution_q8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, vgg, "VGG")->Apply(VGG)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
1772 
// TensorFlow Lite baseline benchmarks over the same network argument
// generators; compiled only when the build provides TFLite.
#ifdef BENCHMARK_TENSORFLOW_LITE
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE
1797 
// ARM Compute Library baseline benchmarks; compiled only when ACL is
// available in the build.
// NOTE(review): unlike the XNNPACK/TFLite lists above, the MobileNet v3
// variants are not registered here — presumably unsupported by this ACL
// wrapper at the time; confirm before adding them.
#ifdef BENCHMARK_ARM_COMPUTE_LIBRARY
  BENCHMARK_CAPTURE(armcl_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
#endif  // BENCHMARK_ARM_COMPUTE_LIBRARY
1820 
// Emit the google-benchmark main() unless the embedding build supplies its own.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
1824