/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// A C++ binary to benchmark a compute graph and its individual operators,
// both on desktop machines and on Android.
//
// See README.md for usage instructions.
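//
// A typical invocation looks like the following; the graph path and layer
// names below are illustrative and should be replaced with values for your
// own model:
//
//   benchmark_model --graph=/data/local/tmp/tensorflow_inception_graph.pb \
//     --input_layer="input:0" --input_layer_shape="1,224,224,3" \
//     --input_layer_type="float" --output_layer="output:0"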

#include "tensorflow/tools/benchmark/benchmark_model.h"

#include <cassert>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "tensorflow/core/common_runtime/graph_constructor.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/platform.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/util/command_line_flags.h"
#include "tensorflow/core/util/reporter.h"
#include "tensorflow/core/util/stat_summarizer.h"

namespace tensorflow {
namespace benchmark_model {

namespace {

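// Runs each of the given init ops (typically variable initializers) once so
// that the graph's stateful nodes hold valid values before any timed runs.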
Status InitializeVariables(Session* session,
                           const std::vector<string>& init_ops) {
  LOG(INFO) << "Initializing graph variables";
  for (const string& init_op : init_ops) {
    TF_RETURN_IF_ERROR(session->Run({}, {}, {init_op}, nullptr));
  }
  return Status::OK();
}

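// Zero-fills `input_tensor`, then copies any user-supplied initialization
// values into its leading elements, cast to the tensor's element type T.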
template <class T>
void InitializeTensor(const std::vector<float>& initialization_values,
                      Tensor* input_tensor) {
  auto type_tensor = input_tensor->flat<T>();
  type_tensor = type_tensor.constant(0);
  if (!initialization_values.empty()) {
    for (int i = 0; i < initialization_values.size(); ++i) {
      type_tensor(i) = static_cast<T>(initialization_values[i]);
    }
  }
}

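// Builds one synthetic tensor per input layer, dispatching on the declared
// data type. String tensors are filled with empty strings and do not accept
// numeric initialization values.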
void CreateTensorsFromInputInfo(
    const std::vector<InputLayerInfo>& inputs,
    std::vector<std::pair<string, tensorflow::Tensor> >* input_tensors) {
  for (const InputLayerInfo& input : inputs) {
    Tensor input_tensor(input.data_type, input.shape);
    switch (input.data_type) {
      case DT_INT32: {
        InitializeTensor<int32>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_FLOAT: {
        InitializeTensor<float>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_QUINT8: {
        InitializeTensor<quint8>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_UINT8: {
        InitializeTensor<uint8>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_BOOL: {
        InitializeTensor<bool>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_STRING: {
        if (!input.initialization_values.empty()) {
          LOG(FATAL) << "Initialization values are not supported for strings";
        }
        auto type_tensor = input_tensor.flat<tstring>();
        type_tensor = type_tensor.constant("");
        break;
      }
      default:
        LOG(FATAL) << "Unsupported input type: "
                   << DataTypeString(input.data_type);
    }
    input_tensors->push_back({input.name, input_tensor});
  }
}

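// Resolves the concrete shape of every node named in `wanted_shapes`. Input
// layers are answered directly from the synthetic tensors; all other nodes
// are resolved by running the graph once and inspecting the fetched outputs.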
Status GetOutputShapes(const std::vector<InputLayerInfo>& inputs,
                       const std::set<string>& wanted_shapes, Session* session,
                       std::unordered_map<string, TensorShape>* node_shapes) {
  std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
  CreateTensorsFromInputInfo(inputs, &input_tensors);
  std::vector<tensorflow::Tensor> output_tensors;
  std::vector<string> output_tensor_names;
  for (const string& wanted_shape : wanted_shapes) {
    bool is_input = false;
    for (const std::pair<string, tensorflow::Tensor>& input_tensor :
         input_tensors) {
      if (input_tensor.first == wanted_shape) {
        (*node_shapes)[wanted_shape] = input_tensor.second.shape();
        is_input = true;
        break;
      }
    }
    if (!is_input) {
      output_tensor_names.push_back(wanted_shape);
    }
  }
  TF_RETURN_IF_ERROR(
      session->Run(input_tensors, output_tensor_names, {}, &output_tensors));
  CHECK_EQ(output_tensors.size(), output_tensor_names.size());
  for (int i = 0; i < output_tensor_names.size(); ++i) {
    const string& wanted_shape_name = output_tensor_names[i];
    const TensorShape& found_shape = output_tensors[i].shape();
    (*node_shapes)[wanted_shape_name] = found_shape;
  }
  return Status::OK();
}

Status CalculateFlops(const GraphDef& graph,
                      const std::vector<InputLayerInfo>& inputs,
                      Session* session, int64* total_flops,
                      std::unordered_map<string, int64>* flops_by_op) {
  std::unordered_set<string> floppable_ops = {
      "Conv2D", "MatMul", "QuantizedConv2D", "QuantizedMatMul",
      "DepthwiseConv2dNative"};

  std::set<string> wanted_shapes;
  for (const NodeDef& node : graph.node()) {
    if (floppable_ops.count(node.op())) {
      for (const string& input : node.input()) {
        wanted_shapes.insert(input);
      }
      wanted_shapes.insert(node.name());
    }
  }
  std::unordered_map<string, TensorShape> found_shapes;
  TF_RETURN_IF_ERROR(
      GetOutputShapes(inputs, wanted_shapes, session, &found_shapes));

  *total_flops = 0;
  for (const NodeDef& node : graph.node()) {
    if (floppable_ops.count(node.op())) {
      int64 current_flops = 0;
      // This is a very crude approximation to FLOPs that only looks at a few
      // op types that commonly form the bulk of the computation for many
      // models. It's included here because getting even an approximate value
      // for FLOPs is still very useful for estimating utilization, versus a
      // device's theoretical maximum FLOPs/second.
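      //
      // Each multiply-accumulate is counted as two FLOPs:
      //  - Conv2D/QuantizedConv2D: every output element is a dot product
      //    over a filter_height * filter_width * filter_in_depth patch.
      //  - MatMul/QuantizedMatMul: every output element is a dot product
      //    over the inner dimension k.
      //  - DepthwiseConv2dNative: each output element reads a single input
      //    channel, so only filter_height * filter_width multiply-adds.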
      if ((node.op() == "Conv2D") || (node.op() == "QuantizedConv2D")) {
        const TensorShape& filter_shape = found_shapes[node.input(1)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64 filter_height = filter_shape.dim_size(0);
        int64 filter_width = filter_shape.dim_size(1);
        int64 filter_in_depth = filter_shape.dim_size(2);
        int64 output_count = output_shape.num_elements();
        current_flops =
            output_count * filter_in_depth * filter_height * filter_width * 2;
      } else if ((node.op() == "MatMul") || (node.op() == "QuantizedMatMul")) {
        const bool transpose_a = node.attr().at("transpose_a").b();
        const TensorShape& a_shape = found_shapes[node.input(0)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64 k;
        if (transpose_a) {
          k = a_shape.dim_size(0);
        } else {
          k = a_shape.dim_size(1);
        }
        int64 output_count = output_shape.num_elements();
        current_flops = k * output_count * 2;
      } else if (node.op() == "DepthwiseConv2dNative") {
        const TensorShape& filter_shape = found_shapes[node.input(1)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64 filter_height = filter_shape.dim_size(0);
        int64 filter_width = filter_shape.dim_size(1);
        int64 output_count = output_shape.num_elements();
        current_flops = output_count * filter_height * filter_width * 2;
      }
      (*flops_by_op)[node.op()] += current_flops;
      *total_flops += current_flops;
    }
  }
  return Status::OK();
}

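// Appends one benchmark result to the reporter output, naming the entry
// "<benchmark_name>_<postfix>" when a postfix is supplied.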
void RecordBenchmarkEntry(const string& output_prefix,
                          const string& benchmark_name, const string& postfix,
                          int num_runs, double total_time_s,
                          double throughput = -1.0) {
  std::stringstream stream;
  stream << benchmark_name;
  if (!postfix.empty()) {
    stream << "_" << postfix;
  }

  TestReporter node_reporter(output_prefix, stream.str());
  TF_QCHECK_OK(node_reporter.Initialize());
  TF_QCHECK_OK(
      node_reporter.Benchmark(num_runs, -1.0, total_time_s, throughput));
  TF_QCHECK_OK(node_reporter.Close());
}

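// Sleeps for the given fractional number of seconds, via Sleep() on Windows
// and nanosleep() elsewhere. Non-positive values return immediately.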
void SleepSeconds(double sleep_seconds) {
  if (sleep_seconds <= 0.0) {
    return;
  }
#ifdef PLATFORM_WINDOWS
  Sleep(sleep_seconds * 1000);
#else
  // Convert the fractional seconds value into a timespec for nanosleep.
  timespec req;
  req.tv_sec = static_cast<time_t>(sleep_seconds);
  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
  nanosleep(&req, nullptr);
#endif
}

}  // namespace

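// Builds a session with the requested thread counts and loads the GraphDef
// from `graph`, trying the binary proto format first and falling back to the
// text format if that fails.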
Status InitializeSession(int num_threads, const string& graph,
                         std::unique_ptr<Session>* session,
                         std::unique_ptr<GraphDef>* graph_def) {
  LOG(INFO) << "Loading TensorFlow.";

  tensorflow::SessionOptions options;
  tensorflow::ConfigProto& config = options.config;
  if (num_threads > 0) {
    config.set_intra_op_parallelism_threads(num_threads);
    config.set_inter_op_parallelism_threads(num_threads);
  }
  LOG(INFO) << "Got config, " << config.device_count_size() << " devices";

  session->reset(tensorflow::NewSession(options));
  graph_def->reset(new GraphDef());
  Status s = ReadBinaryProto(Env::Default(), graph, graph_def->get());
  if (!s.ok()) {
    s = ReadTextProto(Env::Default(), graph, graph_def->get());
  }

  if (!s.ok()) {
    LOG(ERROR) << "Could not create TensorFlow Graph: " << s;
    return s;
  }

  s = (*session)->Create(*(graph_def->get()));
  if (!s.ok()) {
    LOG(ERROR) << "Could not create TensorFlow Session: " << s;
    return s;
  }

  return Status::OK();
}

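// Runs the graph once over synthetic input tensors, reporting the wall-clock
// time in `inference_time_us`. When `stats` is non-null the run is traced at
// FULL_TRACE and the resulting step stats are fed into the summarizer.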
Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
                    const std::vector<string>& outputs,
                    const std::vector<string>& targets, Session* session,
                    StatSummarizer* stats, int64* inference_time_us) {
  std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
  CreateTensorsFromInputInfo(inputs, &input_tensors);

  std::vector<tensorflow::Tensor> output_tensors;

  tensorflow::Status s;

  RunOptions run_options;
  if (stats != nullptr) {
    run_options.set_trace_level(RunOptions::FULL_TRACE);
  }

  RunMetadata run_metadata;
  const int64 start_time = Env::Default()->NowMicros();
  s = session->Run(run_options, input_tensors, outputs, targets,
                   &output_tensors, &run_metadata);
  const int64 end_time = Env::Default()->NowMicros();
  *inference_time_us = end_time - start_time;

  if (!s.ok()) {
    LOG(ERROR) << "Error during inference: " << s;
    return s;
  }

  if (stats != nullptr) {
    assert(run_metadata.has_step_stats());
    const StepStats& step_stats = run_metadata.step_stats();
    stats->ProcessStepStats(step_stats);
  }

  return s;
}

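// Calls RunBenchmark repeatedly, sleeping `sleep_seconds` between runs, until
// either `num_runs` runs have completed or the cumulative inference time
// exceeds `max_time_s` (a non-positive `num_runs` means run until the time
// limit alone is hit).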
Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
                        const std::vector<InputLayerInfo>& inputs,
                        const std::vector<string>& outputs,
                        const std::vector<string>& targets, Session* session,
                        StatSummarizer* stats, int64* total_time_us,
                        int64* actual_num_runs) {
  *total_time_us = 0;

  LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
            << max_time_s << " seconds "
            << (stats != nullptr ? "with" : "without")
            << " detailed stat logging, with " << sleep_seconds
            << "s sleep between inferences";

  Stat<int64> stat;
  const bool until_max_time = num_runs <= 0;
  for (int i = 0; until_max_time || i < num_runs; ++i) {
    int64 time;
    Status run_status =
        RunBenchmark(inputs, outputs, targets, session, stats, &time);
    // Bail out before recording stats so a failed run is neither counted nor
    // silently masked by the max-time check below.
    if (!run_status.ok()) {
      LOG(INFO) << "Failed on run " << i;
      return run_status;
    }
    stat.UpdateStat(time);
    (*total_time_us) += time;
    ++(*actual_num_runs);

    if (max_time_s > 0.0 && (*total_time_us / 1000000.0) > max_time_s) {
      break;
    }

    // If requested, sleep between runs for an arbitrary amount of time.
    // This can be helpful to determine the effect of mobile processor
    // scaling and thermal throttling.
    if (sleep_seconds > 0.0) {
      SleepSeconds(sleep_seconds);
    }
  }
  std::stringstream stream;
  stat.OutputToStream(&stream);
  LOG(INFO) << stream.str() << std::endl;

  return Status::OK();
}

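// Parses the command-line flags, loads and initializes the graph, and then
// benchmarks it in three phases: warmup runs, a timed pass without stat
// collection (the headline numbers), and a second timed pass with full
// tracing to attribute time to individual ops.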
int Main(int argc, char** argv) {
  string graph = "/data/local/tmp/tensorflow_inception_graph.pb";
  string init_ops_string = "";
  string input_layer_string = "input:0";
  string input_layer_shape_string = "1,224,224,3";
  string input_layer_type_string = "float";
  string input_layer_values_string = "";
  string output_layer_string = "output:0";
  string target_layer_string = "";
  int max_num_runs = 1000;
  string max_time = "10.0";
  string inference_delay = "-1.0";
  string inter_benchmark_delay = "-1.0";
  int num_threads = -1;
  string benchmark_name = "";
  string output_prefix = "";
  bool show_sizes = false;
  bool show_run_order = true;
  int run_order_limit = 0;
  bool show_time = true;
  int time_limit = 10;
  bool show_memory = true;
  int memory_limit = 10;
  bool show_type = true;
  bool show_summary = true;
  bool show_flops = false;
  int warmup_runs = 1;

  std::vector<Flag> flag_list = {
      Flag("graph", &graph, "graph file name"),
      Flag("init_ops", &init_ops_string, "init ops"),
      Flag("input_layer", &input_layer_string, "input layer names"),
      Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
      Flag("input_layer_type", &input_layer_type_string, "input layer type"),
      Flag("input_layer_values", &input_layer_values_string,
           "values to initialize the inputs with"),
      Flag("output_layer", &output_layer_string, "output layer name"),
      Flag("target_layer", &target_layer_string, "target layer name"),
      Flag("max_num_runs", &max_num_runs, "number of runs max"),
      Flag("max_time", &max_time, "length to run max"),
      Flag("inference_delay", &inference_delay,
           "delay between runs in seconds"),
      Flag("inter_benchmark_delay", &inter_benchmark_delay,
           "delay between benchmarks in seconds"),
      Flag("num_threads", &num_threads, "number of threads"),
      Flag("benchmark_name", &benchmark_name, "benchmark name"),
      Flag("output_prefix", &output_prefix, "benchmark output prefix"),
      Flag("show_sizes", &show_sizes, "whether to show sizes"),
      Flag("show_run_order", &show_run_order,
           "whether to list stats by run order"),
      Flag("run_order_limit", &run_order_limit,
           "how many items to show by run order"),
      Flag("show_time", &show_time, "whether to list stats by time taken"),
      Flag("time_limit", &time_limit, "how many items to show by time taken"),
      Flag("show_memory", &show_memory, "whether to list stats by memory used"),
      Flag("memory_limit", &memory_limit,
           "how many items to show by memory used"),
      Flag("show_type", &show_type, "whether to list stats by op type"),
      Flag("show_summary", &show_summary,
           "whether to show a summary of the stats"),
      Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"),
      Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"),
  };
  string usage = Flags::Usage(argv[0], flag_list);
  const bool parse_result = Flags::Parse(&argc, argv, flag_list);

  if (!parse_result) {
    LOG(ERROR) << usage;
    return -1;
  }

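  // Layer names and types are comma-separated, but shapes and initialization
  // values already use commas internally, so ':' separates the entries for
  // different inputs in --input_layer_shape and --input_layer_values.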
  std::vector<string> init_ops = str_util::Split(init_ops_string, ',');
  std::vector<string> input_layers = str_util::Split(input_layer_string, ',');
  std::vector<string> input_layer_shapes =
      str_util::Split(input_layer_shape_string, ':');
  std::vector<string> input_layer_types =
      str_util::Split(input_layer_type_string, ',');
  std::vector<string> input_layer_values =
      str_util::Split(input_layer_values_string, ':');
  std::vector<string> output_layers = str_util::Split(output_layer_string, ',');
  std::vector<string> target_layers = str_util::Split(target_layer_string, ',');
  if ((input_layers.size() != input_layer_shapes.size()) ||
      (input_layers.size() != input_layer_types.size())) {
    LOG(ERROR) << "There must be the same number of items in --input_layer,"
               << " --input_layer_shape, and --input_layer_type, for example"
               << " --input_layer=input1,input2 --input_layer_type=float,float "
               << " --input_layer_shape=1,224,224,4:1,20";
    LOG(ERROR) << "--input_layer=" << input_layer_string << " ("
               << input_layers.size() << " items)";
    LOG(ERROR) << "--input_layer_type=" << input_layer_type_string << " ("
               << input_layer_types.size() << " items)";
    LOG(ERROR) << "--input_layer_shape=" << input_layer_shape_string << " ("
               << input_layer_shapes.size() << " items)";
    return -1;
  }
  const size_t inputs_count = input_layers.size();

  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
  if (argc > 1) {
    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
    return -1;
  }

  LOG(INFO) << "Graph: [" << graph << "]";
  LOG(INFO) << "Init ops: [" << init_ops_string << "]";
  LOG(INFO) << "Input layers: [" << input_layer_string << "]";
  LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
  LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
  LOG(INFO) << "Output layers: [" << output_layer_string << "]";
  LOG(INFO) << "Target layers: [" << target_layer_string << "]";
  LOG(INFO) << "Num runs: [" << max_num_runs << "]";
  LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]";
  LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay
            << "]";
  LOG(INFO) << "Num threads: [" << num_threads << "]";
  LOG(INFO) << "Benchmark name: [" << benchmark_name << "]";
  LOG(INFO) << "Output prefix: [" << output_prefix << "]";
  LOG(INFO) << "Show sizes: [" << show_sizes << "]";
  LOG(INFO) << "Warmup runs: [" << warmup_runs << "]";

  std::unique_ptr<Session> session;
  std::unique_ptr<StatSummarizer> stats;
  std::unique_ptr<GraphDef> graph_def;

  int64 initialization_start_us = Env::Default()->NowMicros();
  Status initialize_status =
      InitializeSession(num_threads, graph, &session, &graph_def);
  int64 initialization_end_us = Env::Default()->NowMicros();
  double initialization_time_s =
      (initialization_end_us - initialization_start_us) / 1000000.0;
  LOG(INFO) << "Initialized session in " << initialization_time_s << "s";
  if (!initialize_status.ok()) {
    return -1;
  }

  if (!init_ops.empty()) {
    Status initialize_variables_status =
        InitializeVariables(session.get(), init_ops);
    if (!initialize_variables_status.ok()) {
      LOG(ERROR) << "Graph variables initialization failed with "
                 << initialize_variables_status;
      return -1;
    }
  }

  StatSummarizerOptions stats_options;
  stats_options.show_run_order = show_run_order;
  stats_options.run_order_limit = run_order_limit;
  stats_options.show_time = show_time;
  stats_options.time_limit = time_limit;
  stats_options.show_memory = show_memory;
  stats_options.memory_limit = memory_limit;
  stats_options.show_type = show_type;
  stats_options.show_summary = show_summary;
  stats.reset(new tensorflow::StatSummarizer(stats_options));

  const double inter_inference_sleep_seconds =
      std::strtod(inference_delay.c_str(), nullptr);
  const double inter_benchmark_sleep_seconds =
      std::strtod(inter_benchmark_delay.c_str(), nullptr);
  const double max_benchmark_time_seconds =
      std::strtod(max_time.c_str(), nullptr);

  std::vector<InputLayerInfo> inputs;
  for (int n = 0; n < inputs_count; ++n) {
    InputLayerInfo input;
    CHECK(DataTypeFromString(input_layer_types[n], &input.data_type))
        << input_layer_types[n] << " was an invalid type";

    std::vector<string> split_layer_shapes =
        str_util::Split(input_layer_shapes[n], ',');
    for (const string& layer_shape : split_layer_shapes) {
      int32 tmp;
      CHECK(strings::safe_strto32(layer_shape, &tmp))
          << "Incorrect size string specified: " << input_layer_shapes[n];
      if (tmp == -1) {
        LOG(ERROR) << "Any unknown sizes in the shapes (-1's) must be replaced"
                   << " with the size you want to benchmark with.";
        return -1;
      } else {
        input.shape.AddDim(tmp);
      }
    }
    input.name = input_layers[n];
    if (n < input_layer_values.size()) {
      std::vector<string> string_tokens =
          str_util::Split(input_layer_values[n], ',');
      input.initialization_values.clear();
      input.initialization_values.reserve(string_tokens.size());
      for (const string& str_val : string_tokens) {
        float val;
        CHECK(strings::safe_strtof(str_val, &val))
            << "Incorrect initialization values string specified: "
            << input_layer_values[n];
        input.initialization_values.push_back(val);
      }
    }
    inputs.push_back(input);
  }

  // If requested, run through the graph first to preinitialize everything
  // before the benchmarking runs.
  int64 warmup_time_us = 0;
  int64 num_warmup_runs = 0;
  if (warmup_runs > 0) {
    Status warmup_time_status =
        TimeMultipleRuns(inter_inference_sleep_seconds, warmup_runs, -1.0,
                         inputs, output_layers, target_layers, session.get(),
                         nullptr, &warmup_time_us, &num_warmup_runs);
    if (!warmup_time_status.ok()) {
      LOG(ERROR) << "Timing failed with " << warmup_time_status;
      return -1;
    }
  }

  // Capture overall inference time without stat logging overhead. This is the
  // timing data that can be compared to other libraries.
  SleepSeconds(inter_benchmark_sleep_seconds);
  int64 no_stat_time_us = 0;
  int64 no_stat_num_runs = 0;
  Status no_stat_time_status = TimeMultipleRuns(
      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
      inputs, output_layers, target_layers, session.get(), nullptr,
      &no_stat_time_us, &no_stat_num_runs);
  const double no_stat_wall_time = no_stat_time_us / 1000000.0;
  if (!no_stat_time_status.ok()) {
    LOG(ERROR) << "Timing failed with " << no_stat_time_status;
    return -1;
  }

  // Run again to gather detailed log stats to get a better idea of where
  // relative time is going within the graph.
  SleepSeconds(inter_benchmark_sleep_seconds);
  int64 stat_time_us = 0;
  int64 stat_num_runs = 0;
  Status stat_time_status = TimeMultipleRuns(
      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
      inputs, output_layers, target_layers, session.get(), stats.get(),
      &stat_time_us, &stat_num_runs);
  if (!stat_time_status.ok()) {
    LOG(ERROR) << "Timing failed with " << stat_time_status;
    return -1;
  }

  LOG(INFO) << "Average inference timings in us: "
            << "Warmup: "
            << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
            << "no stats: " << no_stat_time_us / no_stat_num_runs << ", "
            << "with stats: " << stat_time_us / stat_num_runs;

  stats->PrintStepStats();

  if (show_sizes) {
    stats->PrintOutputs();
  }

  if (show_flops) {
    int64 total_flops;
    std::unordered_map<string, int64> flops_by_op;
    Status flop_status = CalculateFlops(*graph_def, inputs, session.get(),
                                        &total_flops, &flops_by_op);
    if (!flop_status.ok()) {
      LOG(ERROR) << "FLOPs calculation failed with " << flop_status;
      return -1;
    }
    LOG(INFO) << "FLOPs estimate: " << strings::HumanReadableNum(total_flops);
    const double mean_run_time = no_stat_wall_time / no_stat_num_runs;
    LOG(INFO) << "FLOPs/second: "
              << strings::HumanReadableNum(
                     static_cast<int64>(total_flops / mean_run_time));
  }

  if (!benchmark_name.empty() && !output_prefix.empty()) {
    // Compute the total number of values per input.
    int64 total_size = inputs[0].shape.num_elements();

    // Throughput in MB/s (2^20 bytes), based on the first input tensor only.
    const double throughput =
        DataTypeSize(inputs[0].data_type) * total_size * no_stat_num_runs /
        static_cast<double>(no_stat_wall_time) / (1024 * 1024);

    // Report the stats.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "", no_stat_num_runs,
                         no_stat_wall_time, throughput);

    // Session initialization time.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-init", 1,
                         initialization_time_s);

    // First inference time. Note: if warmup_runs is > 1 this will actually be
    // an average of all the warmup runs.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-first-inference",
                         warmup_runs, warmup_time_us / 1000000.0);

    // Time from starting to initialize TF to getting the first result back.
    // This also assumes that only one warmup run is performed.
    RecordBenchmarkEntry(
        output_prefix, benchmark_name, "meta-init-plus-first-inference", 1,
        initialization_time_s + (warmup_time_us / 1000000.0) / warmup_runs);

    std::map<std::string, int64_t> node_type_map_count;
    std::map<std::string, int64_t> node_type_map_time;
    std::map<std::string, int64_t> node_type_map_memory;
    std::map<std::string, int64_t> node_type_map_times_called;

    int64_t accumulated_us;
    stats->ComputeStatsByType(&node_type_map_count, &node_type_map_time,
                              &node_type_map_memory,
                              &node_type_map_times_called, &accumulated_us);
    for (const auto& time : node_type_map_time) {
      LOG(INFO) << "Outputting: [" << time.first << "]";
      RecordBenchmarkEntry(output_prefix, benchmark_name, time.first,
                           stat_num_runs,
                           (time.second * stat_num_runs) / 1000000.0f);
    }
  }

  return 0;
}

}  // namespace benchmark_model
}  // namespace tensorflow
704