/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// A C++ binary to benchmark a compute graph and its individual operators,
// both on desktop machines and on Android.
//
// See README.md for usage instructions.

#include "tensorflow/tools/benchmark/benchmark_model.h"

#include <cassert>
#include <cmath>
#include <cstdlib>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/platform.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/util/command_line_flags.h"
#include "tensorflow/core/util/reporter.h"
#include "tensorflow/core/util/stat_summarizer.h"

namespace tensorflow {
namespace benchmark_model {

namespace {

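// Runs each of the given init ops once so that the graph's variables hold
// values before any benchmark passes execute.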
Status InitializeVariables(Session* session,
                           const std::vector<string>& init_ops) {
  LOG(INFO) << "Initializing graph variables";
  for (const string& init_op : init_ops) {
    TF_RETURN_IF_ERROR(session->Run({}, {}, {init_op}, nullptr));
  }
  return Status::OK();
}

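// Fills the tensor with zeros, then overwrites its leading elements with any
// user-supplied initialization values, cast to the element type T.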
template <class T>
void InitializeTensor(const std::vector<float>& initialization_values,
                      Tensor* input_tensor) {
  auto type_tensor = input_tensor->flat<T>();
  type_tensor = type_tensor.constant(0);
  if (!initialization_values.empty()) {
    for (int i = 0; i < initialization_values.size(); ++i) {
      type_tensor(i) = static_cast<T>(initialization_values[i]);
    }
  }
}

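// Builds a (layer name, Tensor) pair for every input layer, dispatching on
// the declared data type to construct and initialize each tensor.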
void CreateTensorsFromInputInfo(
    const std::vector<InputLayerInfo>& inputs,
    std::vector<std::pair<string, tensorflow::Tensor> >* input_tensors) {
  for (const InputLayerInfo& input : inputs) {
    Tensor input_tensor(input.data_type, input.shape);
    switch (input.data_type) {
      case DT_INT32: {
        InitializeTensor<int32>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_FLOAT: {
        InitializeTensor<float>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_QUINT8: {
        InitializeTensor<quint8>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_UINT8: {
        InitializeTensor<uint8>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_BOOL: {
        InitializeTensor<bool>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_STRING: {
        if (!input.initialization_values.empty()) {
          LOG(FATAL) << "Initialization values are not supported for strings";
        }
        auto type_tensor = input_tensor.flat<string>();
        type_tensor = type_tensor.constant("");
        break;
      }
      default:
        LOG(FATAL) << "Unsupported input type: "
                   << DataTypeString(input.data_type);
    }
    input_tensors->push_back({input.name, input_tensor});
  }
}

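// Looks up the concrete shape of every node named in wanted_shapes. Shapes of
// input layers are read directly from their tensors; all other nodes are
// fetched in a single Session::Run with dummy inputs so their output shapes
// can be recorded.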
Status GetOutputShapes(const std::vector<InputLayerInfo>& inputs,
                       const std::set<string>& wanted_shapes, Session* session,
                       std::unordered_map<string, TensorShape>* node_shapes) {
  std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
  CreateTensorsFromInputInfo(inputs, &input_tensors);
  std::vector<tensorflow::Tensor> output_tensors;
  std::vector<string> output_tensor_names;
  for (const string& wanted_shape : wanted_shapes) {
    bool is_input = false;
    for (const std::pair<string, tensorflow::Tensor>& input_tensor :
         input_tensors) {
      if (input_tensor.first == wanted_shape) {
        (*node_shapes)[wanted_shape] = input_tensor.second.shape();
        is_input = true;
        break;
      }
    }
    if (!is_input) {
      output_tensor_names.push_back(wanted_shape);
    }
  }
  TF_RETURN_IF_ERROR(
      session->Run(input_tensors, output_tensor_names, {}, &output_tensors));
  CHECK_EQ(output_tensors.size(), output_tensor_names.size());
  for (int i = 0; i < output_tensor_names.size(); ++i) {
    const string& wanted_shape_name = output_tensor_names[i];
    const TensorShape& found_shape = output_tensors[i].shape();
    (*node_shapes)[wanted_shape_name] = found_shape;
  }
  return Status::OK();
}

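// Estimates the graph's total FLOPs by discovering tensor shapes with a
// single run and then applying analytic per-op counts for a small set of
// convolution and matrix-multiply ops. Also returns a per-op-type breakdown.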
Status CalculateFlops(const GraphDef& graph,
                      const std::vector<InputLayerInfo>& inputs,
                      Session* session, int64* total_flops,
                      std::unordered_map<string, int64>* flops_by_op) {
  std::unordered_set<string> floppable_ops = {
      "Conv2D", "MatMul", "QuantizedConv2D", "QuantizedMatMul",
      "DepthwiseConv2dNative"};

  std::set<string> wanted_shapes;
  for (const NodeDef& node : graph.node()) {
    if (floppable_ops.count(node.op())) {
      for (const string& input : node.input()) {
        wanted_shapes.insert(input);
      }
      wanted_shapes.insert(node.name());
    }
  }
  std::unordered_map<string, TensorShape> found_shapes;
  TF_RETURN_IF_ERROR(
      GetOutputShapes(inputs, wanted_shapes, session, &found_shapes));

  *total_flops = 0;
  for (const NodeDef& node : graph.node()) {
    if (floppable_ops.count(node.op())) {
      int64 current_flops = 0;
      // This is a very crude approximation to FLOPs that only looks at a few
      // op types that commonly form the bulk of the computation for many
      // models. It's included here because getting even an approximate value
      // for FLOPs is still very useful for estimating utilization, versus a
      // device's theoretical maximum FLOPs/second.
      if ((node.op() == "Conv2D") || (node.op() == "QuantizedConv2D")) {
        const TensorShape& filter_shape = found_shapes[node.input(1)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64 filter_height = filter_shape.dim_size(0);
        int64 filter_width = filter_shape.dim_size(1);
        int64 filter_in_depth = filter_shape.dim_size(2);
        int64 output_count = output_shape.num_elements();
        current_flops =
            output_count * filter_in_depth * filter_height * filter_width * 2;
      } else if ((node.op() == "MatMul") || (node.op() == "QuantizedMatMul")) {
        const bool transpose_a = node.attr().at("transpose_a").b();
        const TensorShape& a_shape = found_shapes[node.input(0)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64 k;
        if (transpose_a) {
          k = a_shape.dim_size(0);
        } else {
          k = a_shape.dim_size(1);
        }
        int64 output_count = output_shape.num_elements();
        current_flops = k * output_count * 2;
      } else if (node.op() == "DepthwiseConv2dNative") {
        const TensorShape& filter_shape = found_shapes[node.input(1)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64 filter_height = filter_shape.dim_size(0);
        int64 filter_width = filter_shape.dim_size(1);
        int64 output_count = output_shape.num_elements();
        current_flops = output_count * filter_height * filter_width * 2;
      }
      (*flops_by_op)[node.op()] += current_flops;
      *total_flops += current_flops;
    }
  }
  return Status::OK();
}

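// Writes one benchmark result through TestReporter, reporting it as
// "<benchmark_name>_<postfix>" when a postfix is supplied.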
void RecordBenchmarkEntry(const string& output_prefix,
                          const string& benchmark_name, const string& postfix,
                          int num_runs, double total_time_s,
                          double throughput = -1.0) {
  std::stringstream stream;
  stream << benchmark_name;
  if (!postfix.empty()) {
    stream << "_" << postfix;
  }

  TestReporter node_reporter(output_prefix, stream.str());
  TF_QCHECK_OK(node_reporter.Initialize());
  TF_QCHECK_OK(
      node_reporter.Benchmark(num_runs, -1.0, total_time_s, throughput));
  TF_QCHECK_OK(node_reporter.Close());
}

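// Sleeps for the given (possibly fractional) number of seconds, using the
// mechanism appropriate to the platform. Non-positive values are a no-op.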
void SleepSeconds(double sleep_seconds) {
  if (sleep_seconds <= 0.0) {
    return;
  }
#ifdef PLATFORM_WINDOWS
  Sleep(sleep_seconds * 1000);
#else
  // Convert the fractional seconds into a timespec for nanosleep.
  timespec req;
  req.tv_sec = static_cast<time_t>(sleep_seconds);
  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
  nanosleep(&req, nullptr);
#endif
}

}  // namespace

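// Creates a session configured with the requested thread count and loads the
// GraphDef at the given path, trying binary protobuf format first and falling
// back to text.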
Status InitializeSession(int num_threads, const string& graph,
                         std::unique_ptr<Session>* session,
                         std::unique_ptr<GraphDef>* graph_def) {
  LOG(INFO) << "Loading TensorFlow.";

  tensorflow::SessionOptions options;
  tensorflow::ConfigProto& config = options.config;
  if (num_threads > 0) {
    config.set_intra_op_parallelism_threads(num_threads);
    config.set_inter_op_parallelism_threads(num_threads);
  }
  LOG(INFO) << "Got config, " << config.device_count_size() << " devices";

  session->reset(tensorflow::NewSession(options));
  graph_def->reset(new GraphDef());
  Status s = ReadBinaryProto(Env::Default(), graph, graph_def->get());
  if (!s.ok()) {
    s = ReadTextProto(Env::Default(), graph, graph_def->get());
  }

  if (!s.ok()) {
    LOG(ERROR) << "Could not create TensorFlow Graph: " << s;
    return s;
  }

  s = (*session)->Create(*(graph_def->get()));
  if (!s.ok()) {
    LOG(ERROR) << "Could not create TensorFlow Session: " << s;
    return s;
  }

  return Status::OK();
}

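// Executes one inference pass and reports its wall time in microseconds. When
// a StatSummarizer is supplied, full tracing is enabled and the resulting
// step stats are forwarded to it.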
Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
                    const std::vector<string>& outputs,
                    const std::vector<string>& targets, Session* session,
                    StatSummarizer* stats, int64* inference_time_us) {
  std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
  CreateTensorsFromInputInfo(inputs, &input_tensors);

  std::vector<tensorflow::Tensor> output_tensors;

  tensorflow::Status s;

  RunOptions run_options;
  if (stats != nullptr) {
    run_options.set_trace_level(RunOptions::FULL_TRACE);
  }

  RunMetadata run_metadata;
  const int64 start_time = Env::Default()->NowMicros();
  s = session->Run(run_options, input_tensors, outputs, targets,
                   &output_tensors, &run_metadata);
  const int64 end_time = Env::Default()->NowMicros();
  *inference_time_us = end_time - start_time;

  if (!s.ok()) {
    LOG(ERROR) << "Error during inference: " << s;
    return s;
  }

  if (stats != nullptr) {
    assert(run_metadata.has_step_stats());
    const StepStats& step_stats = run_metadata.step_stats();
    stats->ProcessStepStats(step_stats);
  }

  return s;
}

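// Calls RunBenchmark repeatedly until num_runs runs complete (or indefinitely
// if num_runs <= 0), stopping early once the accumulated run time exceeds
// max_time_s when that limit is positive. Accumulates the total wall time and
// the actual number of runs, and logs latency statistics at the end.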
Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
                        const std::vector<InputLayerInfo>& inputs,
                        const std::vector<string>& outputs,
                        const std::vector<string>& targets, Session* session,
                        StatSummarizer* stats, int64* total_time_us,
                        int64* actual_num_runs) {
  *total_time_us = 0;

  LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
            << max_time_s << " seconds "
            << (stats != nullptr ? "with" : "without")
            << " detailed stat logging, with " << sleep_seconds
            << "s sleep between inferences";

  Stat<int64> stat;
  const bool until_max_time = num_runs <= 0;
  for (int i = 0; until_max_time || i < num_runs; ++i) {
    int64 time;
    Status run_status =
        RunBenchmark(inputs, outputs, targets, session, stats, &time);
    stat.UpdateStat(time);
    (*total_time_us) += time;
    ++(*actual_num_runs);

    // Surface a failed run before checking the time budget, so an error is
    // never silently swallowed by the time-limit break below.
    if (!run_status.ok()) {
      LOG(INFO) << "Failed on run " << i;
      return run_status;
    }

    if (max_time_s > 0.0 && (*total_time_us / 1000000.0) > max_time_s) {
      break;
    }

    // If requested, sleep between runs for an arbitrary amount of time.
    // This can be helpful to determine the effect of mobile processor
    // scaling and thermal throttling.
    if (sleep_seconds > 0.0) {
      SleepSeconds(sleep_seconds);
    }
  }
  std::stringstream stream;
  stat.OutputToStream(&stream);
  LOG(INFO) << stream.str() << std::endl;

  return Status::OK();
}

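// Entry point: parses command-line flags, loads and initializes the graph,
// then runs warmup, plain-timing, and stat-gathering benchmark passes and
// reports the results.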
int Main(int argc, char** argv) {
  string graph = "/data/local/tmp/tensorflow_inception_graph.pb";
  string init_ops_string = "";
  string input_layer_string = "input:0";
  string input_layer_shape_string = "1,224,224,3";
  string input_layer_type_string = "float";
  string input_layer_values_string = "";
  string output_layer_string = "output:0";
  string target_layer_string = "";
  int max_num_runs = 1000;
  string max_time = "10.0";
  string inference_delay = "-1.0";
  string inter_benchmark_delay = "-1.0";
  int num_threads = -1;
  string benchmark_name = "";
  string output_prefix = "";
  bool show_sizes = false;
  bool show_run_order = true;
  int run_order_limit = 0;
  bool show_time = true;
  int time_limit = 10;
  bool show_memory = true;
  int memory_limit = 10;
  bool show_type = true;
  bool show_summary = true;
  bool show_flops = false;
  int warmup_runs = 1;

  std::vector<Flag> flag_list = {
      Flag("graph", &graph, "graph file name"),
      Flag("init_ops", &init_ops_string, "init ops"),
      Flag("input_layer", &input_layer_string, "input layer names"),
      Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
      Flag("input_layer_type", &input_layer_type_string, "input layer type"),
      Flag("input_layer_values", &input_layer_values_string,
           "values to initialize the inputs with"),
      Flag("output_layer", &output_layer_string, "output layer name"),
      Flag("target_layer", &target_layer_string, "target layer name"),
      Flag("max_num_runs", &max_num_runs, "number of runs max"),
      Flag("max_time", &max_time, "length to run max"),
      Flag("inference_delay", &inference_delay,
           "delay between runs in seconds"),
      Flag("inter_benchmark_delay", &inter_benchmark_delay,
           "delay between benchmarks in seconds"),
      Flag("num_threads", &num_threads, "number of threads"),
      Flag("benchmark_name", &benchmark_name, "benchmark name"),
      Flag("output_prefix", &output_prefix, "benchmark output prefix"),
      Flag("show_sizes", &show_sizes, "whether to show sizes"),
      Flag("show_run_order", &show_run_order,
           "whether to list stats by run order"),
      Flag("run_order_limit", &run_order_limit,
           "how many items to show by run order"),
      Flag("show_time", &show_time, "whether to list stats by time taken"),
      Flag("time_limit", &time_limit, "how many items to show by time taken"),
      Flag("show_memory", &show_memory, "whether to list stats by memory used"),
      Flag("memory_limit", &memory_limit,
           "how many items to show by memory used"),
      Flag("show_type", &show_type, "whether to list stats by op type"),
      Flag("show_summary", &show_summary,
           "whether to show a summary of the stats"),
      Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"),
      Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"),
  };
  string usage = Flags::Usage(argv[0], flag_list);
  const bool parse_result = Flags::Parse(&argc, argv, flag_list);

  if (!parse_result) {
    LOG(ERROR) << usage;
    return -1;
  }

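  // Break the flag strings apart into per-layer lists. Layer names and types
  // are comma-separated; shapes and initialization values use ':' between
  // layers and ',' within a layer.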
  std::vector<string> init_ops = str_util::Split(init_ops_string, ',');
  std::vector<string> input_layers = str_util::Split(input_layer_string, ',');
  std::vector<string> input_layer_shapes =
      str_util::Split(input_layer_shape_string, ':');
  std::vector<string> input_layer_types =
      str_util::Split(input_layer_type_string, ',');
  std::vector<string> input_layer_values =
      str_util::Split(input_layer_values_string, ':');
  std::vector<string> output_layers = str_util::Split(output_layer_string, ',');
  std::vector<string> target_layers = str_util::Split(target_layer_string, ',');
  if ((input_layers.size() != input_layer_shapes.size()) ||
      (input_layers.size() != input_layer_types.size())) {
    LOG(ERROR) << "There must be the same number of items in --input_layer,"
               << " --input_layer_shape, and --input_layer_type, for example"
               << " --input_layer=input1,input2 --input_layer_type=float,float "
               << " --input_layer_shape=1,224,224,4:1,20";
    LOG(ERROR) << "--input_layer=" << input_layer_string << " ("
               << input_layers.size() << " items)";
    LOG(ERROR) << "--input_layer_type=" << input_layer_type_string << " ("
               << input_layer_types.size() << " items)";
    LOG(ERROR) << "--input_layer_shape=" << input_layer_shape_string << " ("
               << input_layer_shapes.size() << " items)";
    return -1;
  }
  const size_t inputs_count = input_layers.size();

  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
  if (argc > 1) {
    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
    return -1;
  }

  LOG(INFO) << "Graph: [" << graph << "]";
  LOG(INFO) << "Init ops: [" << init_ops_string << "]";
  LOG(INFO) << "Input layers: [" << input_layer_string << "]";
  LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
  LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
  LOG(INFO) << "Output layers: [" << output_layer_string << "]";
  LOG(INFO) << "Target layers: [" << target_layer_string << "]";
  LOG(INFO) << "Num runs: [" << max_num_runs << "]";
  LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]";
  LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay
            << "]";
  LOG(INFO) << "Num threads: [" << num_threads << "]";
  LOG(INFO) << "Benchmark name: [" << benchmark_name << "]";
  LOG(INFO) << "Output prefix: [" << output_prefix << "]";
  LOG(INFO) << "Show sizes: [" << show_sizes << "]";
  LOG(INFO) << "Warmup runs: [" << warmup_runs << "]";

  std::unique_ptr<Session> session;
  std::unique_ptr<StatSummarizer> stats;
  std::unique_ptr<GraphDef> graph_def;

  int64 initialization_start_us = Env::Default()->NowMicros();
  Status initialize_status =
      InitializeSession(num_threads, graph, &session, &graph_def);
  int64 initialization_end_us = Env::Default()->NowMicros();
  double initialization_time_s =
      (initialization_end_us - initialization_start_us) / 1000000.0;
  LOG(INFO) << "Initialized session in " << initialization_time_s << "s";
  if (!initialize_status.ok()) {
    return -1;
  }

  if (!init_ops.empty()) {
    Status initialize_variables_status =
        InitializeVariables(session.get(), init_ops);
    if (!initialize_variables_status.ok()) {
      LOG(ERROR) << "Graph variables initialization failed with "
                 << initialize_variables_status;
      return -1;
    }
  }

  StatSummarizerOptions stats_options;
  stats_options.show_run_order = show_run_order;
  stats_options.run_order_limit = run_order_limit;
  stats_options.show_time = show_time;
  stats_options.time_limit = time_limit;
  stats_options.show_memory = show_memory;
  stats_options.memory_limit = memory_limit;
  stats_options.show_type = show_type;
  stats_options.show_summary = show_summary;
  stats.reset(new tensorflow::StatSummarizer(stats_options));

  const double inter_inference_sleep_seconds =
      std::strtod(inference_delay.c_str(), nullptr);
  const double inter_benchmark_sleep_seconds =
      std::strtod(inter_benchmark_delay.c_str(), nullptr);
  const double max_benchmark_time_seconds =
      std::strtod(max_time.c_str(), nullptr);

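  // Assemble an InputLayerInfo for each input layer from the parsed flags,
  // validating the declared type and rejecting unknown (-1) dimensions.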
  std::vector<InputLayerInfo> inputs;
  for (int n = 0; n < inputs_count; ++n) {
    InputLayerInfo input;
    CHECK(DataTypeFromString(input_layer_types[n], &input.data_type))
        << input_layer_types[n] << " was an invalid type";
    std::vector<int32> sizes;
    CHECK(str_util::SplitAndParseAsInts(input_layer_shapes[n], ',', &sizes))
        << "Incorrect size string specified: " << input_layer_shapes[n];
    for (int i = 0; i < sizes.size(); ++i) {
      int32 size = sizes[i];
      if (size == -1) {
        LOG(ERROR) << "Any unknown sizes in the shapes (-1's) must be replaced"
                   << " with the size you want to benchmark with.";
        return -1;
      }
      input.shape.AddDim(sizes[i]);
    }
    input.name = input_layers[n];
    if (n < input_layer_values.size()) {
      CHECK(str_util::SplitAndParseAsFloats(input_layer_values[n], ',',
                                            &input.initialization_values))
          << "Incorrect initialization values string specified: "
          << input_layer_values[n];
    }
    inputs.push_back(input);
  }

  // If requested, run through the graph first to preinitialize everything
  // before the benchmarking runs.
  int64 warmup_time_us = 0;
  int64 num_warmup_runs = 0;
  if (warmup_runs > 0) {
    Status warmup_time_status =
        TimeMultipleRuns(inter_inference_sleep_seconds, warmup_runs, -1.0,
                         inputs, output_layers, target_layers, session.get(),
                         nullptr, &warmup_time_us, &num_warmup_runs);
    if (!warmup_time_status.ok()) {
      LOG(ERROR) << "Timing failed with " << warmup_time_status;
      return -1;
    }
  }

  // Capture overall inference time without stat logging overhead. This is the
  // timing data that can be compared to other libraries.
  SleepSeconds(inter_benchmark_sleep_seconds);
  int64 no_stat_time_us = 0;
  int64 no_stat_num_runs = 0;
  Status no_stat_time_status = TimeMultipleRuns(
      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
      inputs, output_layers, target_layers, session.get(), nullptr,
      &no_stat_time_us, &no_stat_num_runs);
  const double no_stat_wall_time = no_stat_time_us / 1000000.0;
  if (!no_stat_time_status.ok()) {
    LOG(ERROR) << "Timing failed with " << no_stat_time_status;
    return -1;
  }

  // Run again to gather detailed log stats to get a better idea of where
  // relative time is going within the graph.
  SleepSeconds(inter_benchmark_sleep_seconds);
  int64 stat_time_us = 0;
  int64 stat_num_runs = 0;
  Status stat_time_status = TimeMultipleRuns(
      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
      inputs, output_layers, target_layers, session.get(), stats.get(),
      &stat_time_us, &stat_num_runs);
  if (!stat_time_status.ok()) {
    LOG(ERROR) << "Timing failed with " << stat_time_status;
    return -1;
  }

  LOG(INFO) << "Average inference timings in us: "
            << "Warmup: "
            << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
            << "no stats: " << no_stat_time_us / no_stat_num_runs << ", "
            << "with stats: " << stat_time_us / stat_num_runs;

  stats->PrintStepStats();

  if (show_sizes) {
    stats->PrintOutputs();
  }

  if (show_flops) {
    int64 total_flops;
    std::unordered_map<string, int64> flops_by_op;
    Status flop_status = CalculateFlops(*graph_def, inputs, session.get(),
                                        &total_flops, &flops_by_op);
    if (!flop_status.ok()) {
      LOG(ERROR) << "FLOPs calculation failed with " << flop_status;
      return -1;
    }
    string pretty_flops;
    if (total_flops < 1000) {
      pretty_flops = strings::StrCat(total_flops, " FLOPs");
    } else if (total_flops < (1000 * 1000)) {
      const float rounded_flops = (total_flops / 1000.0f);
      pretty_flops = strings::StrCat(rounded_flops, "k FLOPs");
    } else if (total_flops < (1000 * 1000 * 1000)) {
      const float rounded_flops = round(total_flops / 1000.0f) / 1000.0f;
      pretty_flops = strings::StrCat(rounded_flops, " million FLOPs");
    } else {
      const float rounded_flops =
          round(total_flops / (1000.0f * 1000.0f)) / 1000.0f;
      pretty_flops = strings::StrCat(rounded_flops, " billion FLOPs");
    }
    LOG(INFO) << "FLOPs estimate: " << strings::HumanReadableNum(total_flops);
    const double mean_run_time = no_stat_wall_time / no_stat_num_runs;
    LOG(INFO) << "FLOPs/second: "
              << strings::HumanReadableNum(
                     static_cast<int64>(total_flops / mean_run_time));
  }

  if (!benchmark_name.empty() && !output_prefix.empty()) {
    // Compute the total number of values per input.
    int64 total_size = inputs[0].shape.num_elements();

    // Throughput in MB/s.
    const double throughput =
        DataTypeSize(inputs[0].data_type) * total_size * no_stat_num_runs /
        static_cast<double>(no_stat_wall_time) / (1024 * 1024);

    // Report the stats.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "", no_stat_num_runs,
                         no_stat_wall_time, throughput);

    // Session initialization time.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-init", 1,
                         initialization_time_s);

    // First inference time. Note: if warmup_runs is > 1 this will actually be
    // an average of all the warmup runs.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-first-inference",
                         warmup_runs, warmup_time_us / 1000000.0);

    // Time from starting to initialize TF to getting the first result back.
    // This also assumes that only one warmup run is performed.
    RecordBenchmarkEntry(
        output_prefix, benchmark_name, "meta-init-plus-first-inference", 1,
        initialization_time_s + (warmup_time_us / 1000000.0) / warmup_runs);

    std::map<std::string, int64_t> node_type_map_count;
    std::map<std::string, int64_t> node_type_map_time;
    std::map<std::string, int64_t> node_type_map_memory;
    std::map<std::string, int64_t> node_type_map_times_called;

    int64_t accumulated_us;
    stats->ComputeStatsByType(&node_type_map_count, &node_type_map_time,
                              &node_type_map_memory,
                              &node_type_map_times_called, &accumulated_us);
    for (const auto& time : node_type_map_time) {
      LOG(INFO) << "Outputting: [" << time.first << "]";
      RecordBenchmarkEntry(output_prefix, benchmark_name, time.first,
                           stat_num_runs,
                           (time.second * stat_num_runs) / 1000000.0f);
    }
  }

  return 0;
}

}  // namespace benchmark_model
}  // namespace tensorflow