• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cfloat>
8 #include <cmath>
9 #include <functional>
10 #include <random>
11 #include <string>
12 #include <vector>
13 
14 #include <xnnpack.h>
15 
16 #include <benchmark/benchmark.h>
17 #ifdef BENCHMARK_TENSORFLOW_LITE
18 #include "flatbuffers/include/flatbuffers/flatbuffers.h"
19 #include "tensorflow/lite/interpreter.h"
20 #include "tensorflow/lite/kernels/register.h"
21 #include "tensorflow/lite/model.h"
22 #include "tensorflow/lite/schema/schema_generated.h"
23 #include "tensorflow/lite/version.h"
24 #endif  // BENCHMARK_TENSORFLOW_LITE */
25 #include "bench/utils.h"
26 
27 #ifndef XNN_NO_QU8_OPERATORS
xnnpack_deconvolution_qu8(benchmark::State & state,const char * net)28 void xnnpack_deconvolution_qu8(benchmark::State& state, const char* net) {
29   const size_t batch_size = state.range(0);
30   const size_t input_height = state.range(1);
31   const size_t input_width = state.range(2);
32   const size_t kernel_height = state.range(3);
33   const size_t kernel_width = state.range(4);
34   const size_t padding = state.range(5);
35   const size_t adjustment = state.range(6);
36   const size_t stride = state.range(7);
37   const size_t dilation = state.range(8);
38   const size_t groups = state.range(9);
39   const size_t group_input_channels = state.range(10);
40   const size_t group_output_channels = state.range(11);
41 
42   std::random_device random_device;
43   auto rng = std::mt19937(random_device());
44   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
45   auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
46 
47   const size_t output_pixel_stride = groups * group_output_channels;
48   const size_t input_pixel_stride = groups * group_input_channels;
49   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
50   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
51   const size_t padding_left = padding / 2;
52   const size_t padding_top = padding / 2;
53   const size_t padding_right = padding - padding_left;
54   const size_t padding_bottom = padding - padding_top;
55   const size_t output_height = std::max(stride * (input_height - 1) + adjustment + effective_kernel_height, padding) - padding;
56   const size_t output_width = std::max(stride * (input_width - 1) + adjustment + effective_kernel_width, padding) - padding;
57 
58   std::vector<uint8_t> input(batch_size * input_height * input_width * input_pixel_stride);
59   std::generate(input.begin(), input.end(), std::ref(u8rng));
60   std::vector<uint8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
61   std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
62   std::vector<int32_t> bias(groups * group_output_channels);
63   std::generate(bias.begin(), bias.end(), std::ref(i32rng));
64   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
65 
66   xnn_status status = xnn_initialize(nullptr /* allocator */);
67   if (status != xnn_status_success) {
68     state.SkipWithError("failed to initialize XNNPACK");
69     return;
70   }
71 
72   const size_t num_buffers = 1 +
73     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
74       sizeof(float) * (kernel.size() + bias.size() + output_elements));
75   std::vector<uint8_t> output(output_elements * num_buffers);
76 
77   std::vector<xnn_operator_t> deconvolution_operators(num_buffers);
78   for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
79     status = xnn_create_deconvolution2d_nhwc_qu8(
80         padding_top, padding_right, padding_bottom, padding_left,
81         kernel_height, kernel_width,
82         stride, stride,
83         dilation, dilation,
84         groups, group_input_channels, group_output_channels,
85         input_pixel_stride, output_pixel_stride,
86         127, 0.5f, 127, 0.5f,
87         kernel.data(), bias.data(),
88         127, 0.5f, 0, 255,
89         0 /* flags */,
90         &deconvolution_op);
91     if (status != xnn_status_success) {
92       state.SkipWithError("failed to create QINT8 Deconvolution operator");
93       return;
94     }
95   }
96 
97   for (size_t i = 0; i < deconvolution_operators.size(); i++) {
98     status = xnn_setup_deconvolution2d_nhwc_qu8(
99         deconvolution_operators[i],
100         batch_size, input_height, input_width,
101         0 /* height adjustment */, 0 /* width adjustment */,
102         input.data(), output.data() + i * output_elements,
103         nullptr /* thread pool */);
104     if (status != xnn_status_success) {
105       state.SkipWithError("failed to setup QINT8 Deconvolution operator");
106       return;
107     }
108   }
109 
110   size_t buffer_index = 0;
111   for (auto _ : state) {
112     state.PauseTiming();
113     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
114     buffer_index = (buffer_index + 1) % num_buffers;
115     state.ResumeTiming();
116 
117     status = xnn_run_operator(deconvolution_operators[buffer_index], nullptr /* thread pool */);
118     if (status != xnn_status_success) {
119       state.SkipWithError("failed to run QINT8 Deconvolution operator");
120       return;
121     }
122   }
123 
124   for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
125     status = xnn_delete_operator(deconvolution_op);
126     if (status != xnn_status_success) {
127       state.SkipWithError("failed to delete QINT8 Deconvolution operator");
128       return;
129     }
130     deconvolution_op = nullptr;
131   }
132 
133   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
134   if (cpu_frequency != 0) {
135     state.counters["cpufreq"] = cpu_frequency;
136   }
137 
138   state.counters["OPS"] = benchmark::Counter(
139   uint64_t(state.iterations()) * 2 *
140     batch_size * input_width * input_width *
141     groups * group_input_channels * group_output_channels *
142     kernel_height * kernel_width,
143   benchmark::Counter::kIsRate);
144 }
145 #endif  // XNN_NO_QU8_OPERATORS
146 
xnnpack_deconvolution_f32(benchmark::State & state,const char * net)147 void xnnpack_deconvolution_f32(benchmark::State& state, const char* net) {
148   const size_t batch_size = state.range(0);
149   const size_t input_height = state.range(1);
150   const size_t input_width = state.range(2);
151   const size_t kernel_height = state.range(3);
152   const size_t kernel_width = state.range(4);
153   const size_t padding = state.range(5);
154   const size_t adjustment = state.range(6);
155   const size_t stride = state.range(7);
156   const size_t dilation = state.range(8);
157   const size_t groups = state.range(9);
158   const size_t group_input_channels = state.range(10);
159   const size_t group_output_channels = state.range(11);
160 
161   std::random_device random_device;
162   auto rng = std::mt19937(random_device());
163   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
164 
165   const size_t output_pixel_stride = groups * group_output_channels;
166   const size_t input_pixel_stride = groups * group_input_channels;
167   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
168   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
169   const size_t padding_left = padding / 2;
170   const size_t padding_top = padding / 2;
171   const size_t padding_right = padding - padding_left;
172   const size_t padding_bottom = padding - padding_top;
173   const size_t output_height = std::max(stride * (input_height - 1) + adjustment + effective_kernel_height, padding) - padding;
174   const size_t output_width = std::max(stride * (input_width - 1) + adjustment + effective_kernel_width, padding) - padding;
175 
176   std::vector<float> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
177   std::generate(input.begin(), input.end(), std::ref(f32rng));
178   std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
179   std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
180   std::vector<float> bias(groups * group_output_channels);
181   std::generate(bias.begin(), bias.end(), std::ref(f32rng));
182   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
183 
184   xnn_status status = xnn_initialize(nullptr /* allocator */);
185   if (status != xnn_status_success) {
186     state.SkipWithError("failed to initialize XNNPACK");
187     return;
188   }
189 
190   const size_t num_buffers = 1 +
191     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
192       sizeof(float) * (kernel.size() + bias.size() + output_elements));
193   std::vector<float> output(output_elements * num_buffers);
194 
195   std::vector<xnn_operator_t> deconvolution_operators(num_buffers);
196   for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
197     status = xnn_create_deconvolution2d_nhwc_f32(
198         padding_top, padding_right, padding_bottom, padding_left,
199         kernel_height, kernel_width,
200         stride, stride,
201         dilation, dilation,
202         groups, group_input_channels, group_output_channels,
203         input_pixel_stride, output_pixel_stride,
204         kernel.data(), bias.data(),
205         -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
206         0 /* flags */,
207         &deconvolution_op);
208     if (status != xnn_status_success) {
209       state.SkipWithError("failed to create FP32 Deconvolution operator");
210       return;
211     }
212   }
213 
214   for (size_t i = 0; i < deconvolution_operators.size(); i++) {
215     status = xnn_setup_deconvolution2d_nhwc_f32(
216         deconvolution_operators[i],
217         batch_size, input_height, input_width,
218         0 /* height adjustment */, 0 /* width adjustment */,
219         input.data(), output.data() + i * output_elements,
220         nullptr /* thread pool */);
221     if (status != xnn_status_success) {
222       state.SkipWithError("failed to setup QINT8 Deconvolution operator");
223       return;
224     }
225   }
226 
227   size_t buffer_index = 0;
228   for (auto _ : state) {
229     state.PauseTiming();
230     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
231     buffer_index = (buffer_index + 1) % num_buffers;
232     state.ResumeTiming();
233 
234     status = xnn_run_operator(deconvolution_operators[buffer_index], nullptr /* thread pool */);
235     if (status != xnn_status_success) {
236       state.SkipWithError("failed to run FP32 Deconvolution operator");
237       return;
238     }
239   }
240 
241   for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
242     status = xnn_delete_operator(deconvolution_op);
243     if (status != xnn_status_success) {
244       state.SkipWithError("failed to delete FP32 Deconvolution operator");
245       return;
246     }
247     deconvolution_op = nullptr;
248   }
249 
250   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
251   if (cpu_frequency != 0) {
252     state.counters["cpufreq"] = cpu_frequency;
253   }
254 
255   state.counters["FLOPS"] = benchmark::Counter(
256     uint64_t(state.iterations()) * 2 *
257       batch_size * input_width * input_width *
258       groups * group_input_channels * group_output_channels *
259       kernel_height * kernel_width,
260     benchmark::Counter::kIsRate);
261 }
262 
263 #ifdef BENCHMARK_TENSORFLOW_LITE
tflite_deconvolution_f32(benchmark::State & state,const char * net)264 void tflite_deconvolution_f32(benchmark::State& state, const char* net) {
265   const size_t batch_size = state.range(0);
266   const size_t input_height = state.range(1);
267   const size_t input_width = state.range(2);
268   const size_t kernel_height = state.range(3);
269   const size_t kernel_width = state.range(4);
270   const size_t padding = state.range(5);
271   const size_t adjustment = state.range(6);
272   const size_t stride = state.range(7);
273   const size_t dilation = state.range(8);
274   const size_t groups = state.range(9);
275   const size_t input_channels = state.range(10);
276   const size_t output_channels = state.range(11);
277 
278   if (groups != 1) {
279     state.SkipWithError("grouped deconvolution is not supported");
280     return;
281   }
282   if (dilation != 1) {
283     state.SkipWithError("dilated deconvolution is not supported");
284     return;
285   }
286 
287   std::random_device random_device;
288   auto rng = std::mt19937(random_device());
289   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
290 
291   tflite::Padding tf_padding = tflite::Padding_VALID;
292   if (padding == (kernel_width - 1) && padding == (kernel_height - 1)) {
293     tf_padding = tflite::Padding_SAME;
294   } else if (padding == 0) {
295     tf_padding = tflite::Padding_VALID;
296   } else {
297     state.SkipWithError("unsupported padding");
298     return;
299   }
300 
301   const size_t output_height = std::max(stride * (input_height - 1) + adjustment + kernel_height, padding) - padding;
302   const size_t output_width = std::max(stride * (input_width - 1) + adjustment + kernel_width, padding) - padding;
303 
304   std::vector<float> kernel(output_channels * kernel_height * kernel_width * input_channels);
305   std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
306 
307   flatbuffers::FlatBufferBuilder builder;
308   flatbuffers::Offset<tflite::OperatorCode> operator_code =
309       CreateOperatorCode(builder, tflite::BuiltinOperator_TRANSPOSE_CONV, 0);
310 
311   flatbuffers::Offset<tflite::TransposeConvOptions> transpose_conv_options = CreateTransposeConvOptions(
312       builder,
313       tf_padding,
314       static_cast<int32_t>(stride), static_cast<int32_t>(stride));
315 
316   const int32_t input_shape[4] = {
317     static_cast<int32_t>(batch_size),
318     static_cast<int32_t>(input_height),
319     static_cast<int32_t>(input_width),
320     static_cast<int32_t>(input_channels)
321   };
322   const int32_t output_shape[4] = {
323     static_cast<int32_t>(batch_size),
324     static_cast<int32_t>(output_height),
325     static_cast<int32_t>(output_width),
326     static_cast<int32_t>(output_channels)
327   };
328   const int32_t filter_shape[4] = {
329     static_cast<int32_t>(output_channels),
330     static_cast<int32_t>(kernel_height),
331     static_cast<int32_t>(kernel_width),
332     static_cast<int32_t>(input_channels)
333   };
334   const int32_t output_shape_shape[1] = { 4 };
335 
336   flatbuffers::Offset<tflite::Buffer> buffers[3] = {
337     tflite::CreateBuffer(builder, builder.CreateVector({})),
338     tflite::CreateBuffer(builder, builder.CreateVector(
339       reinterpret_cast<const uint8_t*>(kernel.data()),
340       sizeof(float) * kernel.size())),
341     tflite::CreateBuffer(builder, builder.CreateVector(
342       reinterpret_cast<const uint8_t*>(output_shape),
343       sizeof(output_shape))),
344   };
345 
346   flatbuffers::Offset<tflite::Tensor> tensors[4] = {
347     tflite::CreateTensor(builder,
348                          builder.CreateVector<int32_t>(output_shape_shape, 1),
349                          tflite::TensorType_INT32,
350                          2 /* buffer id */,
351                          builder.CreateString("output_shape")),
352     tflite::CreateTensor(builder,
353                          builder.CreateVector<int32_t>(filter_shape, 4),
354                          tflite::TensorType_FLOAT32,
355                          1 /* buffer id */,
356                          builder.CreateString("filter")),
357     tflite::CreateTensor(builder,
358                          builder.CreateVector<int32_t>(input_shape, 4),
359                          tflite::TensorType_FLOAT32,
360                          0 /* buffer id */,
361                          builder.CreateString("input")),
362     tflite::CreateTensor(builder,
363                          builder.CreateVector<int32_t>(output_shape, 4),
364                          tflite::TensorType_FLOAT32,
365                          0 /* buffer id */,
366                          builder.CreateString("output")),
367   };
368 
369   const int32_t op_inputs[3] = { 0, 1, 2 };
370   const int32_t op_outputs[1] = { 3 };
371   flatbuffers::Offset<tflite::Operator> op = CreateOperator(
372       builder,
373       0 /* opcode_index */,
374       builder.CreateVector<int32_t>(op_inputs, 3),
375       builder.CreateVector<int32_t>(op_outputs, 1),
376       tflite::BuiltinOptions_TransposeConvOptions,
377       transpose_conv_options.Union());
378 
379   const int32_t graph_inputs[1] = { 2 };
380   const int32_t graph_outputs[1] = { 3 };
381   flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
382       builder,
383       builder.CreateVector(tensors, 4),
384       builder.CreateVector<int32_t>(graph_inputs, 1),
385       builder.CreateVector<int32_t>(graph_outputs, 1),
386       builder.CreateVector(&op, 1),
387       builder.CreateString("TransposeConv subgraph"));
388 
389   flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("TransposeConv model");
390 
391   flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
392       TFLITE_SCHEMA_VERSION,
393       builder.CreateVector(&operator_code, 1),
394       builder.CreateVector(&subgraph, 1),
395       description,
396       builder.CreateVector(buffers, 3));
397 
398   builder.Finish(model_buffer);
399 
400   const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
401   tflite::ops::builtin::BuiltinOpResolver resolver;
402   tflite::InterpreterBuilder interpreterBuilder(model, resolver);
403   std::unique_ptr<tflite::Interpreter> interpreter;
404   if (interpreterBuilder(&interpreter) != kTfLiteOk) {
405     state.SkipWithError("failed to create TFLite interpreter");
406     return;
407   }
408   if (interpreter == nullptr) {
409     state.SkipWithError("TFLite interpreter is null");
410     return;
411   }
412   interpreter->SetNumThreads(1);
413 
414   if (interpreter->AllocateTensors() != kTfLiteOk) {
415     state.SkipWithError("failed to allocate tensors");
416     return;
417   }
418 
419   std::generate(
420     interpreter->typed_tensor<float>(2),
421     interpreter->typed_tensor<float>(2) + batch_size * input_channels * input_height * input_width,
422     std::ref(f32rng));
423 
424   for (auto _ : state) {
425     state.PauseTiming();
426     benchmark::utils::WipeCache();
427     benchmark::utils::PrefetchToL1(
428       interpreter->typed_tensor<float>(2),
429       batch_size * input_channels * input_height * input_width * sizeof(float));
430     state.ResumeTiming();
431 
432     if (interpreter->Invoke() != kTfLiteOk) {
433       state.SkipWithError("failed to invoke TFLite interpreter");
434       return;
435     }
436   }
437 
438   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
439   if (cpu_frequency != 0) {
440     state.counters["cpufreq"] = cpu_frequency;
441   }
442 
443   state.counters["FLOPS"] = benchmark::Counter(
444     uint64_t(state.iterations()) * 2 *
445       batch_size * input_width * input_width *
446       input_channels * output_channels *
447       kernel_height * kernel_width,
448     benchmark::Counter::kIsRate);
449 
450   interpreter.reset();
451 }
452 #endif  // BENCHMARK_TENSORFLOW_LITE
453 
454 // FCN-32 model (PASCAL VOC version).
455 // We assume CIF image (352x288) on model input / output.
FCN32(benchmark::internal::Benchmark * b)456 static void FCN32(benchmark::internal::Benchmark* b) {
457   b->ArgNames({"N", "H", "W", "KH", "KW", "P", "A", "S", "D", "G", "GCin", "GCout"});
458 
459   /*       N  H   W  KH  KW  P  A   S  D  G  GCin  GCout */
460   b->Args({1, 9, 11, 64, 64, 0, 0, 32, 1, 1,   21,   21});
461 }
462 
463 // FCN-16 model (PASCAL VOC version).
464 // We assume CIF image (352x288) on model input / output.
FCN16(benchmark::internal::Benchmark * b)465 static void FCN16(benchmark::internal::Benchmark* b) {
466   b->ArgNames({"N", "H", "W", "KH", "KW", "P", "A", "S", "D", "G", "GCin", "GCout"});
467 
468   /*       N   H   W  KH  KW  P  A   S  D  G  GCin  GCout */
469   b->Args({1,  9, 11,  4,  4, 0, 0,  2, 1, 1,   21,   21});
470   b->Args({1, 18, 22, 32, 32, 0, 0, 16, 1, 1,   21,   21});
471 }
472 
473 // FCN-8 model (PASCAL VOC version).
474 // We assume CIF image (352x288) on model input / output.
FCN8(benchmark::internal::Benchmark * b)475 static void FCN8(benchmark::internal::Benchmark* b) {
476   b->ArgNames({"N", "H", "W", "KH", "KW", "P", "A", "S", "D", "G", "GCin", "GCout"});
477 
478   /*       N   H   W  KH  KW  P  A  S  D  G  GCin  GCout */
479   b->Args({1,  9, 11,  4,  4, 0, 0, 2, 1, 1,   21,   21});
480   b->Args({1, 18, 22,  4,  4, 0, 0, 2, 1, 1,   21,   21});
481   b->Args({1, 36, 44, 16, 16, 0, 0, 8, 1, 1,   21,   21});
482 }
483 
ENet(benchmark::internal::Benchmark * b)484 static void ENet(benchmark::internal::Benchmark* b) {
485   b->ArgNames({"N", "H", "W", "KH", "KW", "P", "A", "S", "D", "G", "GCin", "GCout"});
486 
487   /********************* Bottleneck 4.0 ********************/
488   /*       N   H    W   KH  KW  P  A  S  D  G  GCin  GCout */
489   b->Args({1,  64,  64,  3,  3, 2, 1, 2, 1, 1,   32,   32});
490   /********************* Bottleneck 5.0 ********************/
491   /*       N   H    W   KH  KW  P  A  S  D  G  GCin  GCout */
492   b->Args({1, 128, 128,  3,  3, 2, 1, 2, 1, 1,   16,   16});
493   /***************** Final Full Convolution ****************/
494   /*       N   H    W   KH  KW  P  A  S  D  G  GCin  GCout */
495   b->Args({1, 256, 256,  2,  2, 0, 0, 2, 1, 1,   16,   12});
496 }
497 
ESPNet(benchmark::internal::Benchmark * b)498 static void ESPNet(benchmark::internal::Benchmark* b) {
499   b->ArgNames({"N", "H", "W", "KH", "KW", "P", "A", "S", "D", "G", "GCin", "GCout"});
500 
501   /*       N   H    W   KH  KW  P  A  S  D  G  GCin  GCout */
502   b->Args({1,  64, 128,  2,  2, 0, 0, 2, 1, 1,   20,   20});
503   b->Args({1, 128, 256,  2,  2, 0, 0, 2, 1, 1,   20,   20});
504   b->Args({1, 256, 512,  2,  2, 0, 0, 2, 1, 1,   20,   20});
505 }
506 
// Benchmark registrations. Each line binds one benchmark function to one
// network label (third macro argument, received as `net`) and applies that
// network's argument table; UseRealTime() is requested for every benchmark.

// XNNPACK FP32 deconvolution.
507 BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn32, "FCN-32")->Apply(FCN32)->UseRealTime();
508 BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn16, "FCN-16")->Apply(FCN16)->UseRealTime();
509 BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn8, "FCN-8")->Apply(FCN8)->UseRealTime();
510 BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, enet, "ENet")->Apply(ENet)->UseRealTime();
511 BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, espnet, "ESPNet")->Apply(ESPNet)->UseRealTime();
512 
// XNNPACK QU8 deconvolution; omitted when XNN_NO_QU8_OPERATORS is defined.
513 #ifndef XNN_NO_QU8_OPERATORS
514 BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn32, "FCN-32")->Apply(FCN32)->UseRealTime();
515 BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn16, "FCN-16")->Apply(FCN16)->UseRealTime();
516 BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn8, "FCN-8")->Apply(FCN8)->UseRealTime();
517 BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, enet, "ENet")->Apply(ENet)->UseRealTime();
518 BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, espnet, "ESPNet")->Apply(ESPNet)->UseRealTime();
519 #endif  // XNN_NO_QU8_OPERATORS
520 
// TensorFlow Lite FP32 TRANSPOSE_CONV; only built when BENCHMARK_TENSORFLOW_LITE is defined.
521 #ifdef BENCHMARK_TENSORFLOW_LITE
522   BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn32, "FCN-32")->Apply(FCN32)->UseRealTime();
523   BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn16, "FCN-16")->Apply(FCN16)->UseRealTime();
524   BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn8, "FCN-8")->Apply(FCN8)->UseRealTime();
525   BENCHMARK_CAPTURE(tflite_deconvolution_f32, enet, "ENet")->Apply(ENet)->UseRealTime();
526   BENCHMARK_CAPTURE(tflite_deconvolution_f32, espnet, "ESPNet")->Apply(ESPNet)->UseRealTime();
527 #endif  // BENCHMARK_TENSORFLOW_LITE
528 
// The benchmark main() is emitted unless XNNPACK_BENCHMARK_NO_MAIN is defined.
529 #ifndef XNNPACK_BENCHMARK_NO_MAIN
530 BENCHMARK_MAIN();
531 #endif
532