// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <limits>
#include <random>
#include <string>
#include <vector>

#include <cpuinfo.h>
#include <xnnpack.h>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_TENSORFLOW_LITE
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/version.h"
#endif  // BENCHMARK_TENSORFLOW_LITE
#include "bench/utils.h"

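// Benchmarks XNNPACK's Q8 (quantized uint8) NHWC deconvolution operator.
// The 12 benchmark arguments are: batch (N), input height/width (H, W),
// kernel height/width (KH, KW), total padding (P), output adjustment (A),
// stride (S), dilation (D), groups (G), and per-group input/output channels.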
void xnnpack_deconvolution_q8(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding = state.range(5);
  const size_t adjustment = state.range(6);
  const size_t stride = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
  auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);

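  // For transposed convolution, output size = stride * (input - 1) + adjustment
  // + effective kernel size - total padding (clamped to be non-negative).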
  const size_t output_pixel_stride = groups * group_output_channels;
  const size_t input_pixel_stride = groups * group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding / 2;
  const size_t padding_top = padding / 2;
  const size_t padding_right = padding - padding_left;
  const size_t padding_bottom = padding - padding_top;
  const size_t output_height = std::max(stride * (input_height - 1) + adjustment + effective_kernel_height, padding) - padding;
  const size_t output_width = std::max(stride * (input_width - 1) + adjustment + effective_kernel_width, padding) - padding;

  // Pad the input with XNN_EXTRA_BYTES, like the F32 benchmark below, so the
  // operator may safely over-read past the last element.
  std::vector<uint8_t> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint8_t));
  std::generate(input.begin(), input.end(), std::ref(u8rng));
  std::vector<uint8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
  std::vector<int32_t> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(s32rng));
  const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
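  // Allocate enough output buffers (and operators) to overflow the last-level
  // cache, then rotate through them so each timed run sees cold output memory.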
  // Estimate the working-set size in bytes from the actual element types:
  // uint8 kernel and output, int32 bias.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements);
  std::vector<uint8_t> output(output_elements * num_buffers);

  std::vector<xnn_operator_t> deconvolution_operators(num_buffers);
  for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
    status = xnn_create_deconvolution2d_nhwc_q8(
        padding_top, padding_right, padding_bottom, padding_left,
        kernel_height, kernel_width,
        stride, stride,
        dilation, dilation,
        groups, group_input_channels, group_output_channels,
        input_pixel_stride, output_pixel_stride,
        127 /* input zero point */, 0.5f /* input scale */,
        127 /* kernel zero point */, 0.5f /* kernel scale */,
        kernel.data(), bias.data(),
        127 /* output zero point */, 0.5f /* output scale */,
        0 /* output min */, 255 /* output max */,
        0 /* flags */,
        &deconvolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to create QINT8 Deconvolution operator");
      return;
    }
  }

  for (size_t i = 0; i < deconvolution_operators.size(); i++) {
    status = xnn_setup_deconvolution2d_nhwc_q8(
        deconvolution_operators[i],
        batch_size, input_height, input_width,
        adjustment /* height adjustment */, adjustment /* width adjustment */,
        input.data(), output.data() + i * output_elements,
        nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup QINT8 Deconvolution operator");
      return;
    }
  }

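  // Timed loop: prefetch the input and advance to the next pre-created
  // operator outside the timed region, then measure xnn_run_operator() alone.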
  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(deconvolution_operators[buffer_index], nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run QINT8 Deconvolution operator");
      return;
    }
  }

  for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
    status = xnn_delete_operator(deconvolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete QINT8 Deconvolution operator");
      return;
    }
    deconvolution_op = nullptr;
  }

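  // Each input pixel feeds kernel_height * kernel_width output positions;
  // at 2 ops per multiply-accumulate this yields the OPS rate below.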
  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * input_height * input_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}

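// Benchmarks XNNPACK's F32 NHWC deconvolution operator; the harness mirrors
// the Q8 benchmark above.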
void xnnpack_deconvolution_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding = state.range(5);
  const size_t adjustment = state.range(6);
  const size_t stride = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);

  const size_t output_pixel_stride = groups * group_output_channels;
  const size_t input_pixel_stride = groups * group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding / 2;
  const size_t padding_top = padding / 2;
  const size_t padding_right = padding - padding_left;
  const size_t padding_bottom = padding - padding_top;
  const size_t output_height = std::max(stride * (input_height - 1) + adjustment + effective_kernel_height, padding) - padding;
  const size_t output_width = std::max(stride * (input_width - 1) + adjustment + effective_kernel_width, padding) - padding;

  std::vector<float> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
  std::vector<float> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(f32rng));
  const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (kernel.size() + bias.size() + output_elements));
  std::vector<float> output(output_elements * num_buffers);

  std::vector<xnn_operator_t> deconvolution_operators(num_buffers);
  for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
    status = xnn_create_deconvolution2d_nhwc_f32(
        padding_top, padding_right, padding_bottom, padding_left,
        kernel_height, kernel_width,
        stride, stride,
        dilation, dilation,
        groups, group_input_channels, group_output_channels,
        input_pixel_stride, output_pixel_stride,
        kernel.data(), bias.data(),
        -std::numeric_limits<float>::infinity() /* output min */,
        +std::numeric_limits<float>::infinity() /* output max */,
        0 /* flags */,
        &deconvolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to create FP32 Deconvolution operator");
      return;
    }
  }

  for (size_t i = 0; i < deconvolution_operators.size(); i++) {
    status = xnn_setup_deconvolution2d_nhwc_f32(
        deconvolution_operators[i],
        batch_size, input_height, input_width,
        adjustment /* height adjustment */, adjustment /* width adjustment */,
        input.data(), output.data() + i * output_elements,
        nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup FP32 Deconvolution operator");
      return;
    }
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(deconvolution_operators[buffer_index], nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run FP32 Deconvolution operator");
      return;
    }
  }

  for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
    status = xnn_delete_operator(deconvolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete FP32 Deconvolution operator");
      return;
    }
    deconvolution_op = nullptr;
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * input_height * input_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_TENSORFLOW_LITE
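// Benchmarks TensorFlow Lite's TRANSPOSE_CONV kernel on the same shapes for
// comparison. The model built here has no bias input, and grouped or dilated
// cases are skipped since TFLite does not support them.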
void tflite_deconvolution_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding = state.range(5);
  const size_t adjustment = state.range(6);
  const size_t stride = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t input_channels = state.range(10);
  const size_t output_channels = state.range(11);

  if (groups != 1) {
    state.SkipWithError("grouped deconvolution is not supported");
    return;
  }
  if (dilation != 1) {
    state.SkipWithError("dilated deconvolution is not supported");
    return;
  }

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);

  // TFLite only exposes SAME/VALID padding; map the explicit padding argument
  // onto one of those, or skip the benchmark.
  tflite::Padding tf_padding = tflite::Padding_VALID;
  if (padding == (kernel_width - 1) && padding == (kernel_height - 1)) {
    tf_padding = tflite::Padding_SAME;
  } else if (padding == 0) {
    tf_padding = tflite::Padding_VALID;
  } else {
    state.SkipWithError("unsupported padding");
    return;
  }

  const size_t output_height = std::max(stride * (input_height - 1) + adjustment + kernel_height, padding) - padding;
  const size_t output_width = std::max(stride * (input_width - 1) + adjustment + kernel_width, padding) - padding;

  std::vector<float> kernel(output_channels * kernel_height * kernel_width * input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));

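  // Build an in-memory FlatBuffer model containing a single TRANSPOSE_CONV
  // operator: tensor 0 is the static output shape, tensor 1 the filter,
  // tensor 2 the input, and tensor 3 the output.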
  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
      CreateOperatorCode(builder, tflite::BuiltinOperator_TRANSPOSE_CONV, 0);

  flatbuffers::Offset<tflite::TransposeConvOptions> transpose_conv_options = CreateTransposeConvOptions(
      builder,
      tf_padding,
      static_cast<int32_t>(stride), static_cast<int32_t>(stride));

  const int32_t input_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(input_height),
    static_cast<int32_t>(input_width),
    static_cast<int32_t>(input_channels)
  };
  const int32_t output_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(output_height),
    static_cast<int32_t>(output_width),
    static_cast<int32_t>(output_channels)
  };
  const int32_t filter_shape[4] = {
    static_cast<int32_t>(output_channels),
    static_cast<int32_t>(kernel_height),
    static_cast<int32_t>(kernel_width),
    static_cast<int32_t>(input_channels)
  };
  const int32_t output_shape_shape[1] = { 4 };

  flatbuffers::Offset<tflite::Buffer> buffers[3] = {
    tflite::CreateBuffer(builder, builder.CreateVector({})),
    tflite::CreateBuffer(builder, builder.CreateVector(
      reinterpret_cast<const uint8_t*>(kernel.data()),
      sizeof(float) * kernel.size())),
    tflite::CreateBuffer(builder, builder.CreateVector(
      reinterpret_cast<const uint8_t*>(output_shape),
      sizeof(output_shape))),
  };

  flatbuffers::Offset<tflite::Tensor> tensors[4] = {
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(output_shape_shape, 1),
                         tflite::TensorType_INT32,
                         2 /* buffer id */,
                         builder.CreateString("output_shape")),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(filter_shape, 4),
                         tflite::TensorType_FLOAT32,
                         1 /* buffer id */,
                         builder.CreateString("filter")),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(input_shape, 4),
                         tflite::TensorType_FLOAT32,
                         0 /* buffer id */,
                         builder.CreateString("input")),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(output_shape, 4),
                         tflite::TensorType_FLOAT32,
                         0 /* buffer id */,
                         builder.CreateString("output")),
  };

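  // TRANSPOSE_CONV consumes (output_shape, filter, input) and produces the
  // output tensor; only the input (tensor 2) is a graph input.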
  const int32_t op_inputs[3] = { 0, 1, 2 };
  const int32_t op_outputs[1] = { 3 };
  flatbuffers::Offset<tflite::Operator> op = CreateOperator(
      builder,
      0 /* opcode_index */,
      builder.CreateVector<int32_t>(op_inputs, 3),
      builder.CreateVector<int32_t>(op_outputs, 1),
      tflite::BuiltinOptions_TransposeConvOptions,
      transpose_conv_options.Union());

  const int32_t graph_inputs[1] = { 2 };
  const int32_t graph_outputs[1] = { 3 };
  flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
      builder,
      builder.CreateVector(tensors, 4),
      builder.CreateVector<int32_t>(graph_inputs, 1),
      builder.CreateVector<int32_t>(graph_outputs, 1),
      builder.CreateVector(&op, 1),
      builder.CreateString("TransposeConv subgraph"));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("TransposeConv model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
      TFLITE_SCHEMA_VERSION,
      builder.CreateVector(&operator_code, 1),
      builder.CreateVector(&subgraph, 1),
      description,
      builder.CreateVector(buffers, 3));

  builder.Finish(model_buffer);

  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolver resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  if (interpreter == nullptr) {
    state.SkipWithError("TFLite interpreter is null");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

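  // Fill the input tensor (index 2) with random data; each timed iteration
  // wipes the cache and prefetches only the input back into L1.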
  std::generate(
    interpreter->typed_tensor<float>(2),
    interpreter->typed_tensor<float>(2) + batch_size * input_channels * input_height * input_width,
    std::ref(f32rng));

  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::WipeCache();
    benchmark::utils::PrefetchToL1(
      interpreter->typed_tensor<float>(2),
      batch_size * input_channels * input_height * input_width * sizeof(float));
    state.ResumeTiming();

    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * input_height * input_width *
      input_channels * output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);

  interpreter.reset();
}
#endif  // BENCHMARK_TENSORFLOW_LITE

// FCN-32 model (PASCAL VOC version).
// We assume a CIF image (352x288) on the model input / output.
static void FCN32(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "P", "A", "S", "D", "G", "GCin", "GCout"});

  /*       N  H   W  KH  KW  P  A   S  D  G  GCin  GCout */
  b->Args({1, 9, 11, 64, 64, 0, 0, 32, 1, 1,   21,   21});
}

// FCN-16 model (PASCAL VOC version).
// We assume a CIF image (352x288) on the model input / output.
static void FCN16(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "P", "A", "S", "D", "G", "GCin", "GCout"});

  /*       N   H   W  KH  KW  P  A   S  D  G  GCin  GCout */
  b->Args({1,  9, 11,  4,  4, 0, 0,  2, 1, 1,   21,   21});
  b->Args({1, 18, 22, 32, 32, 0, 0, 16, 1, 1,   21,   21});
}

// FCN-8 model (PASCAL VOC version).
// We assume a CIF image (352x288) on the model input / output.
static void FCN8(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "P", "A", "S", "D", "G", "GCin", "GCout"});

  /*       N   H   W  KH  KW  P  A  S  D  G  GCin  GCout */
  b->Args({1,  9, 11,  4,  4, 0, 0, 2, 1, 1,   21,   21});
  b->Args({1, 18, 22,  4,  4, 0, 0, 2, 1, 1,   21,   21});
  b->Args({1, 36, 44, 16, 16, 0, 0, 8, 1, 1,   21,   21});
}

static void ENet(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "P", "A", "S", "D", "G", "GCin", "GCout"});

  /********************* Bottleneck 4.0 ********************/
  /*       N   H    W   KH  KW  P  A  S  D  G  GCin  GCout */
  b->Args({1,  64,  64,  3,  3, 2, 1, 2, 1, 1,   32,   32});
  /********************* Bottleneck 5.0 ********************/
  /*       N   H    W   KH  KW  P  A  S  D  G  GCin  GCout */
  b->Args({1, 128, 128,  3,  3, 2, 1, 2, 1, 1,   16,   16});
  /***************** Final Full Convolution ****************/
  /*       N   H    W   KH  KW  P  A  S  D  G  GCin  GCout */
  b->Args({1, 256, 256,  2,  2, 0, 0, 2, 1, 1,   16,   12});
}

static void ESPNet(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "P", "A", "S", "D", "G", "GCin", "GCout"});

  /*       N   H    W   KH  KW  P  A  S  D  G  GCin  GCout */
  b->Args({1,  64, 128,  2,  2, 0, 0, 2, 1, 1,   20,   20});
  b->Args({1, 128, 256,  2,  2, 0, 0, 2, 1, 1,   20,   20});
  b->Args({1, 256, 512,  2,  2, 0, 0, 2, 1, 1,   20,   20});
}

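// Register every model's deconvolution shapes against the FP32 and Q8 XNNPACK
// operators, and against TFLite's TRANSPOSE_CONV when it is built in.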
BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn32, "FCN-32")->Apply(FCN32)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn16, "FCN-16")->Apply(FCN16)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn8, "FCN-8")->Apply(FCN8)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, enet, "ENet")->Apply(ENet)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, espnet, "ESPNet")->Apply(ESPNet)->UseRealTime();

BENCHMARK_CAPTURE(xnnpack_deconvolution_q8, fcn32, "FCN-32")->Apply(FCN32)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_q8, fcn16, "FCN-16")->Apply(FCN16)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_q8, fcn8, "FCN-8")->Apply(FCN8)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_q8, enet, "ENet")->Apply(ENet)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_q8, espnet, "ESPNet")->Apply(ESPNet)->UseRealTime();

#ifdef BENCHMARK_TENSORFLOW_LITE
BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn32, "FCN-32")->Apply(FCN32)->UseRealTime();
BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn16, "FCN-16")->Apply(FCN16)->UseRealTime();
BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn8, "FCN-8")->Apply(FCN8)->UseRealTime();
BENCHMARK_CAPTURE(tflite_deconvolution_f32, enet, "ENet")->Apply(ENet)->UseRealTime();
BENCHMARK_CAPTURE(tflite_deconvolution_f32, espnet, "ESPNet")->Apply(ESPNet)->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif