1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2020 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <algorithm>
10 #include <array>
11 #include <cmath>
12 #include <functional>
13 #include <limits>
14 #include <random>
15 #include <vector>
16
17 #include <xnnpack.h>
18
19 #include <benchmark/benchmark.h>
20 #include "bench/utils.h"
21 #ifdef BENCHMARK_TENSORFLOW_LITE
22 #include "flatbuffers/include/flatbuffers/flatbuffers.h"
23 #include "tensorflow/lite/interpreter.h"
24 #include "tensorflow/lite/kernels/register.h"
25 #include "tensorflow/lite/model.h"
26 #include "tensorflow/lite/schema/schema_generated.h"
27 #include "tensorflow/lite/version.h"
28 #endif // BENCHMARK_TENSORFLOW_LITE
29
30
31 #ifndef XNN_NO_QU8_OPERATORS
xnnpack_sigmoid_qu8(benchmark::State & state)32 static void xnnpack_sigmoid_qu8(benchmark::State& state) {
33 const size_t batch_size = state.range(0);
34 const size_t channels = state.range(1);
35
36 std::random_device random_device;
37 auto rng = std::mt19937(random_device());
38 auto u8rng = std::bind(
39 std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
40
41 std::vector<uint8_t> input(batch_size * channels);
42 std::vector<uint8_t> output(batch_size * channels);
43 std::generate(input.begin(), input.end(), std::ref(u8rng));
44 std::fill(output.begin(), output.end(), 0xA5);
45
46 xnn_status status = xnn_initialize(nullptr /* allocator */);
47 if (status != xnn_status_success) {
48 state.SkipWithError("failed to initialize XNNPACK");
49 return;
50 }
51
52 xnn_operator_t sigmoid_op = nullptr;
53 status = xnn_create_sigmoid_nc_qu8(
54 channels, channels /* input stride */, channels /* output stride */,
55 127 /* input zero point */, 1.0f /* input scale */,
56 0 /* output zero point */, 1.0f / 256.0f /* output scale */,
57 0 /* output min */, 255 /* output max */,
58 0 /* flags */, &sigmoid_op);
59 if (status != xnn_status_success || sigmoid_op == nullptr) {
60 state.SkipWithError("failed to create Sigmoid operator");
61 return;
62 }
63
64 status = xnn_setup_sigmoid_nc_qu8(
65 sigmoid_op,
66 batch_size,
67 input.data(), output.data(),
68 nullptr /* thread pool */);
69 if (status != xnn_status_success) {
70 state.SkipWithError("failed to setup Sigmoid operator");
71 return;
72 }
73
74 for (auto _ : state) {
75 status = xnn_run_operator(sigmoid_op, nullptr /* thread pool */);
76 if (status != xnn_status_success) {
77 state.SkipWithError("failed to run Sigmoid operator");
78 return;
79 }
80 }
81
82 status = xnn_delete_operator(sigmoid_op);
83 if (status != xnn_status_success) {
84 state.SkipWithError("failed to delete Sigmoid operator");
85 return;
86 }
87
88 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
89 if (cpu_frequency != 0) {
90 state.counters["cpufreq"] = cpu_frequency;
91 }
92
93 const size_t elements_per_iteration = batch_size * channels;
94 state.counters["elements"] =
95 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
96
97 const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(uint8_t);
98 state.counters["bytes"] =
99 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
100 }
101 #endif // XNN_NO_QU8_OPERATORS
102
xnnpack_sigmoid_f32(benchmark::State & state)103 static void xnnpack_sigmoid_f32(benchmark::State& state) {
104 const size_t batch_size = state.range(0);
105 const size_t channels = state.range(1);
106
107 std::random_device random_device;
108 auto rng = std::mt19937(random_device());
109 auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
110
111 std::vector<float> input(batch_size * channels);
112 std::vector<float> output(batch_size * channels);
113 std::generate(input.begin(), input.end(), std::ref(f32rng));
114 std::fill(output.begin(), output.end(), std::nanf(""));
115
116 xnn_status status = xnn_initialize(nullptr /* allocator */);
117 if (status != xnn_status_success) {
118 state.SkipWithError("failed to initialize XNNPACK");
119 return;
120 }
121
122 xnn_operator_t sigmoid_op = nullptr;
123 status = xnn_create_sigmoid_nc_f32(
124 channels, channels /* input stride */, channels /* output stride */,
125 0 /* flags */, &sigmoid_op);
126 if (status != xnn_status_success || sigmoid_op == nullptr) {
127 state.SkipWithError("failed to create Sigmoid operator");
128 return;
129 }
130
131 status = xnn_setup_sigmoid_nc_f32(
132 sigmoid_op,
133 batch_size,
134 input.data(), output.data(),
135 nullptr /* thread pool */);
136 if (status != xnn_status_success) {
137 state.SkipWithError("failed to setup Sigmoid operator");
138 return;
139 }
140
141 for (auto _ : state) {
142 status = xnn_run_operator(sigmoid_op, nullptr /* thread pool */);
143 if (status != xnn_status_success) {
144 state.SkipWithError("failed to run Sigmoid operator");
145 return;
146 }
147 }
148
149 status = xnn_delete_operator(sigmoid_op);
150 if (status != xnn_status_success) {
151 state.SkipWithError("failed to delete Sigmoid operator");
152 return;
153 }
154
155 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
156 if (cpu_frequency != 0) {
157 state.counters["cpufreq"] = cpu_frequency;
158 }
159
160 const size_t elements_per_iteration = batch_size * channels;
161 state.counters["elements"] =
162 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
163
164 const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(float);
165 state.counters["bytes"] =
166 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
167 }
168
169 #ifdef BENCHMARK_TENSORFLOW_LITE
tflite_sigmoid_f32(benchmark::State & state)170 static void tflite_sigmoid_f32(benchmark::State& state) {
171 const size_t batch_size = state.range(0);
172 const size_t channels = state.range(1);
173
174 std::random_device random_device;
175 auto rng = std::mt19937(random_device());
176 auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
177
178 flatbuffers::FlatBufferBuilder builder;
179 const flatbuffers::Offset<tflite::OperatorCode> operator_code =
180 CreateOperatorCode(builder, tflite::BuiltinOperator_LOGISTIC);
181
182 const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
183 tflite::CreateBuffer(builder, builder.CreateVector({})),
184 }};
185
186 const std::array<int32_t, 4> input_shape{{
187 static_cast<int32_t>(batch_size),
188 static_cast<int32_t>(1 /* height */),
189 static_cast<int32_t>(1 /* width */),
190 static_cast<int32_t>(channels)
191 }};
192 const std::array<int32_t, 4> output_shape{{
193 static_cast<int32_t>(batch_size),
194 static_cast<int32_t>(1 /* height */),
195 static_cast<int32_t>(1 /* width */),
196 static_cast<int32_t>(channels)
197 }};
198
199 const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
200 tflite::CreateTensor(builder,
201 builder.CreateVector<int32_t>(input_shape.data(), input_shape.size()),
202 tflite::TensorType_FLOAT32),
203 tflite::CreateTensor(builder,
204 builder.CreateVector<int32_t>(output_shape.data(), output_shape.size()),
205 tflite::TensorType_FLOAT32),
206 }};
207
208 const std::array<int32_t, 1> op_inputs{{ 0 }};
209 const std::array<int32_t, 1> op_outputs{{ 1 }};
210 flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
211 builder,
212 0 /* opcode_index */,
213 builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
214 builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
215
216 const std::array<int32_t, 1> graph_inputs{{ 0 }};
217 const std::array<int32_t, 1> graph_outputs{{ 1 }};
218 const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
219 builder,
220 builder.CreateVector(tensors.data(), tensors.size()),
221 builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
222 builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
223 builder.CreateVector(&op, 1));
224
225 const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
226 TFLITE_SCHEMA_VERSION,
227 builder.CreateVector(&operator_code, 1),
228 builder.CreateVector(&subgraph, 1),
229 builder.CreateString("Sigmoid model"),
230 builder.CreateVector(buffers.data(), buffers.size()));
231
232 builder.Finish(model_buffer);
233
234 const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
235 tflite::ops::builtin::BuiltinOpResolver resolver;
236 tflite::InterpreterBuilder interpreterBuilder(model, resolver);
237 std::unique_ptr<tflite::Interpreter> interpreter;
238 if (interpreterBuilder(&interpreter) != kTfLiteOk) {
239 state.SkipWithError("failed to create TFLite interpreter");
240 return;
241 }
242 if (interpreter == nullptr) {
243 state.SkipWithError("TFLite interpreter is null");
244 return;
245 }
246 interpreter->SetNumThreads(1);
247
248 if (interpreter->AllocateTensors() != kTfLiteOk) {
249 state.SkipWithError("failed to allocate tensors");
250 return;
251 }
252
253 std::generate(
254 interpreter->typed_tensor<float>(0),
255 interpreter->typed_tensor<float>(0) + batch_size * channels,
256 std::ref(f32rng));
257
258 for (auto _ : state) {
259 if (interpreter->Invoke() != kTfLiteOk) {
260 state.SkipWithError("failed to invoke TFLite interpreter");
261 return;
262 }
263 }
264
265 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
266 if (cpu_frequency != 0) {
267 state.counters["cpufreq"] = cpu_frequency;
268 }
269
270 const size_t elements_per_iteration = batch_size * channels;
271 state.counters["elements"] =
272 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
273
274 const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(float);
275 state.counters["bytes"] =
276 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
277
278 interpreter.reset();
279 }
280 #endif // BENCHMARK_TENSORFLOW_LITE
281
CharacteristicArguments(benchmark::internal::Benchmark * b)282 static void CharacteristicArguments(benchmark::internal::Benchmark* b)
283 {
284 b->ArgNames({"N", "C"});
285
286 int32_t c = 16;
287 for (int32_t n = 224; n >= 7; n /= 2) {
288 b->Args({n * n, c});
289 c *= 2;
290 }
291 }
292
293 #ifndef XNN_NO_QU8_OPERATORS
294 BENCHMARK(xnnpack_sigmoid_qu8)->Apply(CharacteristicArguments)->UseRealTime();
295 #endif // XNN_NO_QU8_OPERATORS
296 BENCHMARK(xnnpack_sigmoid_f32)->Apply(CharacteristicArguments)->UseRealTime();
297
298 #ifdef BENCHMARK_TENSORFLOW_LITE
299 BENCHMARK(tflite_sigmoid_f32)->Apply(CharacteristicArguments)->UseRealTime();
300 #endif // BENCHMARK_TENSORFLOW_LITE
301
302 #ifndef XNNPACK_BENCHMARK_NO_MAIN
303 BENCHMARK_MAIN();
304 #endif
305