• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // A C++ binary to benchmark a compute graph and its individual operators,
17 // both on desktop machines and on Android.
18 //
19 // See README.md for usage instructions.
20 
21 #include "tensorflow/tools/benchmark/benchmark_model.h"
22 
23 #include <cstdlib>
24 #include <memory>
25 #include <string>
26 #include <unordered_set>
27 #include <vector>
28 
29 #include "tensorflow/core/common_runtime/graph_constructor.h"
30 #include "tensorflow/core/framework/graph.pb.h"
31 #include "tensorflow/core/framework/node_def.pb.h"
32 #include "tensorflow/core/framework/step_stats.pb.h"
33 #include "tensorflow/core/framework/tensor.h"
34 #include "tensorflow/core/graph/algorithm.h"
35 #include "tensorflow/core/graph/graph.h"
36 #include "tensorflow/core/lib/strings/numbers.h"
37 #include "tensorflow/core/lib/strings/str_util.h"
38 #include "tensorflow/core/lib/strings/strcat.h"
39 #include "tensorflow/core/platform/env.h"
40 #include "tensorflow/core/platform/init_main.h"
41 #include "tensorflow/core/platform/logging.h"
42 #include "tensorflow/core/platform/platform.h"
43 #include "tensorflow/core/platform/types.h"
44 #include "tensorflow/core/public/session.h"
45 #include "tensorflow/core/util/command_line_flags.h"
46 #include "tensorflow/core/util/reporter.h"
47 #include "tensorflow/core/util/stat_summarizer.h"
48 
49 namespace tensorflow {
50 namespace benchmark_model {
51 
52 namespace {
53 
InitializeVariables(Session * session,const std::vector<string> & init_ops)54 Status InitializeVariables(Session* session,
55                            const std::vector<string>& init_ops) {
56   LOG(INFO) << "Initializing graph variables";
57   for (const string& init_op : init_ops) {
58     TF_RETURN_IF_ERROR(session->Run({}, {}, {init_op}, nullptr));
59   }
60   return Status::OK();
61 }
62 
63 template <class T>
InitializeTensor(const std::vector<float> & initialization_values,Tensor * input_tensor)64 void InitializeTensor(const std::vector<float>& initialization_values,
65                       Tensor* input_tensor) {
66   auto type_tensor = input_tensor->flat<T>();
67   type_tensor = type_tensor.constant(0);
68   if (!initialization_values.empty()) {
69     for (int i = 0; i < initialization_values.size(); ++i) {
70       type_tensor(i) = static_cast<T>(initialization_values[i]);
71     }
72   }
73 }
74 
CreateTensorsFromInputInfo(const std::vector<InputLayerInfo> & inputs,std::vector<std::pair<string,tensorflow::Tensor>> * input_tensors)75 void CreateTensorsFromInputInfo(
76     const std::vector<InputLayerInfo>& inputs,
77     std::vector<std::pair<string, tensorflow::Tensor> >* input_tensors) {
78   for (const InputLayerInfo& input : inputs) {
79     Tensor input_tensor(input.data_type, input.shape);
80     switch (input.data_type) {
81       case DT_INT32: {
82         InitializeTensor<int32>(input.initialization_values, &input_tensor);
83         break;
84       }
85       case DT_FLOAT: {
86         InitializeTensor<float>(input.initialization_values, &input_tensor);
87         break;
88       }
89       case DT_QUINT8: {
90         InitializeTensor<quint8>(input.initialization_values, &input_tensor);
91         break;
92       }
93       case DT_UINT8: {
94         InitializeTensor<uint8>(input.initialization_values, &input_tensor);
95         break;
96       }
97       case DT_BOOL: {
98         InitializeTensor<bool>(input.initialization_values, &input_tensor);
99         break;
100       }
101       case DT_STRING: {
102         if (!input.initialization_values.empty()) {
103           LOG(FATAL) << "Initialization values are not supported for strings";
104         }
105         auto type_tensor = input_tensor.flat<tstring>();
106         type_tensor = type_tensor.constant("");
107         break;
108       }
109       default:
110         LOG(FATAL) << "Unsupported input type: "
111                    << DataTypeString(input.data_type);
112     }
113     input_tensors->push_back({input.name, input_tensor});
114   }
115 }
116 
GetOutputShapes(const std::vector<InputLayerInfo> & inputs,const std::set<string> & wanted_shapes,Session * session,std::unordered_map<string,TensorShape> * node_shapes)117 Status GetOutputShapes(const std::vector<InputLayerInfo>& inputs,
118                        const std::set<string>& wanted_shapes, Session* session,
119                        std::unordered_map<string, TensorShape>* node_shapes) {
120   std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
121   CreateTensorsFromInputInfo(inputs, &input_tensors);
122   std::vector<tensorflow::Tensor> output_tensors;
123   std::vector<string> output_tensor_names;
124   for (const string& wanted_shape : wanted_shapes) {
125     bool is_input = false;
126     for (const std::pair<string, tensorflow::Tensor>& input_tensor :
127          input_tensors) {
128       if (input_tensor.first == wanted_shape) {
129         (*node_shapes)[wanted_shape] = input_tensor.second.shape();
130         is_input = true;
131         break;
132       }
133     }
134     if (!is_input) {
135       output_tensor_names.push_back(wanted_shape);
136     }
137   }
138   TF_RETURN_IF_ERROR(
139       session->Run(input_tensors, output_tensor_names, {}, &output_tensors));
140   CHECK_EQ(output_tensors.size(), output_tensor_names.size());
141   for (int i = 0; i < output_tensor_names.size(); ++i) {
142     const string& wanted_shape_name = output_tensor_names[i];
143     const TensorShape& found_shape = output_tensors[i].shape();
144     (*node_shapes)[wanted_shape_name] = found_shape;
145   }
146   return Status::OK();
147 }
148 
CalculateFlops(const GraphDef & graph,const std::vector<InputLayerInfo> & inputs,Session * session,int64 * total_flops,std::unordered_map<string,int64> * flops_by_op)149 Status CalculateFlops(const GraphDef& graph,
150                       const std::vector<InputLayerInfo>& inputs,
151                       Session* session, int64* total_flops,
152                       std::unordered_map<string, int64>* flops_by_op) {
153   std::unordered_set<string> floppable_ops = {
154       "Conv2D", "MatMul", "QuantizedConv2D", "QuantizedMatMul",
155       "DepthwiseConv2dNative"};
156 
157   std::set<string> wanted_shapes;
158   for (const NodeDef& node : graph.node()) {
159     if (floppable_ops.count(node.op())) {
160       for (const string& input : node.input()) {
161         wanted_shapes.insert(input);
162       }
163       wanted_shapes.insert(node.name());
164     }
165   }
166   std::unordered_map<string, TensorShape> found_shapes;
167   TF_RETURN_IF_ERROR(
168       GetOutputShapes(inputs, wanted_shapes, session, &found_shapes));
169 
170   *total_flops = 0;
171   for (const NodeDef& node : graph.node()) {
172     if (floppable_ops.count(node.op())) {
173       int64 current_flops = 0;
174       // This is a very crude approximation to FLOPs that only looks at a few
175       // op types that commonly form the bulk of the computation for many
176       // models. It's included here because getting even an approximate value
177       // for FLOPs is still very useful for estimating utilization, versus a
178       // device's theoretical maximum FLOPs/second.
179       if ((node.op() == "Conv2D") || (node.op() == "QuantizedConv2D")) {
180         const TensorShape& filter_shape = found_shapes[node.input(1)];
181         const TensorShape& output_shape = found_shapes[node.name()];
182         int64 filter_height = filter_shape.dim_size(0);
183         int64 filter_width = filter_shape.dim_size(1);
184         int64 filter_in_depth = filter_shape.dim_size(2);
185         int64 output_count = output_shape.num_elements();
186         current_flops =
187             output_count * filter_in_depth * filter_height * filter_width * 2;
188       } else if ((node.op() == "MatMul") || (node.op() == "QuantizedMatMul")) {
189         const bool transpose_a = node.attr().at("transpose_a").b();
190         const TensorShape& a_shape = found_shapes[node.input(0)];
191         const TensorShape& output_shape = found_shapes[node.name()];
192         int64 k;
193         if (transpose_a) {
194           k = a_shape.dim_size(0);
195         } else {
196           k = a_shape.dim_size(1);
197         }
198         int64 output_count = output_shape.num_elements();
199         current_flops = k * output_count * 2;
200       } else if (node.op() == "DepthwiseConv2dNative") {
201         const TensorShape& filter_shape = found_shapes[node.input(1)];
202         const TensorShape& output_shape = found_shapes[node.name()];
203         int64 filter_height = filter_shape.dim_size(0);
204         int64 filter_width = filter_shape.dim_size(1);
205         int64 output_count = output_shape.num_elements();
206         current_flops = output_count * filter_height * filter_width * 2;
207       }
208       (*flops_by_op)[node.op()] += current_flops;
209       *total_flops += current_flops;
210     }
211   }
212   return Status::OK();
213 }
214 
RecordBenchmarkEntry(const string & output_prefix,const string & benchmark_name,const string & postfix,int num_runs,double total_time_s,double throughput=-1.0)215 void RecordBenchmarkEntry(const string& output_prefix,
216                           const string& benchmark_name, const string& postfix,
217                           int num_runs, double total_time_s,
218                           double throughput = -1.0) {
219   std::stringstream stream;
220   stream << benchmark_name;
221   if (!postfix.empty()) {
222     stream << "_" << postfix;
223   }
224 
225   TestReporter node_reporter(output_prefix, stream.str());
226   TF_QCHECK_OK(node_reporter.Initialize());
227   TF_QCHECK_OK(
228       node_reporter.Benchmark(num_runs, -1.0, total_time_s, throughput));
229   TF_QCHECK_OK(node_reporter.Close());
230 }
231 
// Blocks the calling thread for roughly `sleep_seconds` seconds. Zero and
// negative durations return immediately without sleeping.
void SleepSeconds(double sleep_seconds) {
  if (sleep_seconds <= 0.0) {
    return;
  }
#ifdef PLATFORM_WINDOWS
  // Sleep() takes its duration in milliseconds.
  Sleep(sleep_seconds * 1000);
#else
  // Split the duration into whole seconds plus the leftover fraction
  // expressed in nanoseconds, which is the form nanosleep() expects.
  const time_t whole_seconds = static_cast<time_t>(sleep_seconds);
  timespec duration;
  duration.tv_sec = whole_seconds;
  duration.tv_nsec = (sleep_seconds - whole_seconds) * 1000000000;
  nanosleep(&duration, nullptr);
#endif
}
246 
247 }  // namespace
248 
InitializeSession(int num_threads,const string & graph,std::unique_ptr<Session> * session,std::unique_ptr<GraphDef> * graph_def)249 Status InitializeSession(int num_threads, const string& graph,
250                          std::unique_ptr<Session>* session,
251                          std::unique_ptr<GraphDef>* graph_def) {
252   LOG(INFO) << "Loading TensorFlow.";
253 
254   tensorflow::SessionOptions options;
255   tensorflow::ConfigProto& config = options.config;
256   if (num_threads > 0) {
257     config.set_intra_op_parallelism_threads(num_threads);
258     config.set_inter_op_parallelism_threads(num_threads);
259   }
260   LOG(INFO) << "Got config, " << config.device_count_size() << " devices";
261 
262   session->reset(tensorflow::NewSession(options));
263   graph_def->reset(new GraphDef());
264   tensorflow::GraphDef tensorflow_graph;
265   Status s = ReadBinaryProto(Env::Default(), graph, graph_def->get());
266   if (!s.ok()) {
267     s = ReadTextProto(Env::Default(), graph, graph_def->get());
268   }
269 
270   if (!s.ok()) {
271     LOG(ERROR) << "Could not create TensorFlow Graph: " << s;
272     return s;
273   }
274 
275   s = (*session)->Create(*(graph_def->get()));
276   if (!s.ok()) {
277     LOG(ERROR) << "Could not create TensorFlow Session: " << s;
278     return s;
279   }
280 
281   return Status::OK();
282 }
283 
RunBenchmark(const std::vector<InputLayerInfo> & inputs,const std::vector<string> & outputs,const std::vector<string> & targets,Session * session,StatSummarizer * stats,int64 * inference_time_us)284 Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
285                     const std::vector<string>& outputs,
286                     const std::vector<string>& targets, Session* session,
287                     StatSummarizer* stats, int64* inference_time_us) {
288   std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
289   CreateTensorsFromInputInfo(inputs, &input_tensors);
290 
291   std::vector<tensorflow::Tensor> output_tensors;
292 
293   tensorflow::Status s;
294 
295   RunOptions run_options;
296   if (stats != nullptr) {
297     run_options.set_trace_level(RunOptions::FULL_TRACE);
298   }
299 
300   RunMetadata run_metadata;
301   const int64 start_time = Env::Default()->NowMicros();
302   s = session->Run(run_options, input_tensors, outputs, targets,
303                    &output_tensors, &run_metadata);
304   const int64 end_time = Env::Default()->NowMicros();
305   *inference_time_us = end_time - start_time;
306 
307   if (!s.ok()) {
308     LOG(ERROR) << "Error during inference: " << s;
309     return s;
310   }
311 
312   if (stats != nullptr) {
313     assert(run_metadata.has_step_stats());
314     const StepStats& step_stats = run_metadata.step_stats();
315     stats->ProcessStepStats(step_stats);
316   }
317 
318   return s;
319 }
320 
TimeMultipleRuns(double sleep_seconds,int num_runs,double max_time_s,const std::vector<InputLayerInfo> & inputs,const std::vector<string> & outputs,const std::vector<string> & targets,Session * session,StatSummarizer * stats,int64 * total_time_us,int64 * actual_num_runs)321 Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
322                         const std::vector<InputLayerInfo>& inputs,
323                         const std::vector<string>& outputs,
324                         const std::vector<string>& targets, Session* session,
325                         StatSummarizer* stats, int64* total_time_us,
326                         int64* actual_num_runs) {
327   *total_time_us = 0;
328 
329   LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
330             << max_time_s << " seconds "
331             << (stats != nullptr ? "with" : "without")
332             << " detailed stat logging, with " << sleep_seconds
333             << "s sleep between inferences";
334 
335   Stat<int64> stat;
336   const bool until_max_time = num_runs <= 0;
337   for (int i = 0; until_max_time || i < num_runs; ++i) {
338     int64 time;
339     Status run_status =
340         RunBenchmark(inputs, outputs, targets, session, stats, &time);
341     stat.UpdateStat(time);
342     (*total_time_us) += time;
343     ++(*actual_num_runs);
344 
345     if (max_time_s > 0.0 && (*total_time_us / 1000000.0) > max_time_s) {
346       break;
347     }
348 
349     if (!run_status.ok()) {
350       LOG(INFO) << "Failed on run " << i;
351       return run_status;
352     }
353 
354     // If requested, sleep between runs for an arbitrary amount of time.
355     // This can be helpful to determine the effect of mobile processor
356     // scaling and thermal throttling.
357     if (sleep_seconds > 0.0) {
358       SleepSeconds(sleep_seconds);
359     }
360   }
361   std::stringstream stream;
362   stat.OutputToStream(&stream);
363   LOG(INFO) << stream.str() << std::endl;
364 
365   return Status::OK();
366 }
367 
Main(int argc,char ** argv)368 int Main(int argc, char** argv) {
369   string graph = "/data/local/tmp/tensorflow_inception_graph.pb";
370   string init_ops_string = "";
371   string input_layer_string = "input:0";
372   string input_layer_shape_string = "1,224,224,3";
373   string input_layer_type_string = "float";
374   string input_layer_values_string = "";
375   string output_layer_string = "output:0";
376   string target_layer_string = "";
377   int max_num_runs = 1000;
378   string max_time = "10.0";
379   string inference_delay = "-1.0";
380   string inter_benchmark_delay = "-1.0";
381   int num_threads = -1;
382   string benchmark_name = "";
383   string output_prefix = "";
384   bool show_sizes = false;
385   bool show_run_order = true;
386   int run_order_limit = 0;
387   bool show_time = true;
388   int time_limit = 10;
389   bool show_memory = true;
390   int memory_limit = 10;
391   bool show_type = true;
392   bool show_summary = true;
393   bool show_flops = false;
394   int warmup_runs = 1;
395 
396   std::vector<Flag> flag_list = {
397       Flag("graph", &graph, "graph file name"),
398       Flag("init_ops", &init_ops_string, "init ops"),
399       Flag("input_layer", &input_layer_string, "input layer names"),
400       Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
401       Flag("input_layer_type", &input_layer_type_string, "input layer type"),
402       Flag("input_layer_values", &input_layer_values_string,
403            "values to initialize the inputs with"),
404       Flag("output_layer", &output_layer_string, "output layer name"),
405       Flag("target_layer", &target_layer_string, "target layer name"),
406       Flag("max_num_runs", &max_num_runs, "number of runs max"),
407       Flag("max_time", &max_time, "length to run max"),
408       Flag("inference_delay", &inference_delay,
409            "delay between runs in seconds"),
410       Flag("inter_benchmark_delay", &inter_benchmark_delay,
411            "delay between benchmarks in seconds"),
412       Flag("num_threads", &num_threads, "number of threads"),
413       Flag("benchmark_name", &benchmark_name, "benchmark name"),
414       Flag("output_prefix", &output_prefix, "benchmark output prefix"),
415       Flag("show_sizes", &show_sizes, "whether to show sizes"),
416       Flag("show_run_order", &show_run_order,
417            "whether to list stats by run order"),
418       Flag("run_order_limit", &run_order_limit,
419            "how many items to show by run order"),
420       Flag("show_time", &show_time, "whether to list stats by time taken"),
421       Flag("time_limit", &time_limit, "how many items to show by time taken"),
422       Flag("show_memory", &show_memory, "whether to list stats by memory used"),
423       Flag("memory_limit", &memory_limit,
424            "how many items to show by memory used"),
425       Flag("show_type", &show_type, "whether to list stats by op type"),
426       Flag("show_summary", &show_summary,
427            "whether to show a summary of the stats"),
428       Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"),
429       Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"),
430   };
431   string usage = Flags::Usage(argv[0], flag_list);
432   const bool parse_result = Flags::Parse(&argc, argv, flag_list);
433 
434   if (!parse_result) {
435     LOG(ERROR) << usage;
436     return -1;
437   }
438 
439   std::vector<string> init_ops = str_util::Split(init_ops_string, ',');
440   std::vector<string> input_layers = str_util::Split(input_layer_string, ',');
441   std::vector<string> input_layer_shapes =
442       str_util::Split(input_layer_shape_string, ':');
443   std::vector<string> input_layer_types =
444       str_util::Split(input_layer_type_string, ',');
445   std::vector<string> input_layer_values =
446       str_util::Split(input_layer_values_string, ':');
447   std::vector<string> output_layers = str_util::Split(output_layer_string, ',');
448   std::vector<string> target_layers = str_util::Split(target_layer_string, ',');
449   if ((input_layers.size() != input_layer_shapes.size()) ||
450       (input_layers.size() != input_layer_types.size())) {
451     LOG(ERROR) << "There must be the same number of items in --input_layer,"
452                << " --input_layer_shape, and --input_layer_type, for example"
453                << " --input_layer=input1,input2 --input_layer_type=float,float "
454                << " --input_layer_shape=1,224,224,4:1,20";
455     LOG(ERROR) << "--input_layer=" << input_layer_string << " ("
456                << input_layers.size() << " items)";
457     LOG(ERROR) << "--input_layer_type=" << input_layer_type_string << " ("
458                << input_layer_types.size() << " items)";
459     LOG(ERROR) << "--input_layer_shape=" << input_layer_shape_string << " ("
460                << input_layer_shapes.size() << " items)";
461     return -1;
462   }
463   const size_t inputs_count = input_layers.size();
464 
465   ::tensorflow::port::InitMain(argv[0], &argc, &argv);
466   if (argc > 1) {
467     LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
468     return -1;
469   }
470 
471   LOG(INFO) << "Graph: [" << graph << "]";
472   LOG(INFO) << "Init ops:" << init_ops_string;
473   LOG(INFO) << "Input layers: [" << input_layer_string << "]";
474   LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
475   LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
476   LOG(INFO) << "Output layers: [" << output_layer_string << "]";
477   LOG(INFO) << "Target layers: [" << target_layer_string << "]";
478   LOG(INFO) << "Num runs: [" << max_num_runs << "]";
479   LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]";
480   LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay
481             << "]";
482   LOG(INFO) << "Num threads: [" << num_threads << "]";
483   LOG(INFO) << "Benchmark name: [" << benchmark_name << "]";
484   LOG(INFO) << "Output prefix: [" << output_prefix << "]";
485   LOG(INFO) << "Show sizes: [" << show_sizes << "]";
486   LOG(INFO) << "Warmup runs: [" << warmup_runs << "]";
487 
488   std::unique_ptr<Session> session;
489   std::unique_ptr<StatSummarizer> stats;
490   std::unique_ptr<GraphDef> graph_def;
491 
492   int64 initialization_start_us = Env::Default()->NowMicros();
493   Status initialize_status =
494       InitializeSession(num_threads, graph, &session, &graph_def);
495   int64 initialization_end_us = Env::Default()->NowMicros();
496   double initialization_time_s =
497       (initialization_end_us - initialization_start_us) / 1000000.0;
498   LOG(INFO) << "Initialized session in " << initialization_time_s << "s";
499   if (!initialize_status.ok()) {
500     return -1;
501   }
502 
503   if (!init_ops.empty()) {
504     Status initialize_variables_status =
505         InitializeVariables(session.get(), init_ops);
506     if (!initialize_variables_status.ok()) {
507       LOG(ERROR) << "Graph variables initialization failed with "
508                  << initialize_variables_status;
509       return -1;
510     }
511   }
512 
513   StatSummarizerOptions stats_options;
514   stats_options.show_run_order = show_run_order;
515   stats_options.run_order_limit = run_order_limit;
516   stats_options.show_time = show_time;
517   stats_options.time_limit = time_limit;
518   stats_options.show_memory = show_memory;
519   stats_options.memory_limit = memory_limit;
520   stats_options.show_type = show_type;
521   stats_options.show_summary = show_summary;
522   stats.reset(new tensorflow::StatSummarizer(stats_options));
523 
524   const double inter_inference_sleep_seconds =
525       std::strtod(inference_delay.c_str(), nullptr);
526   const double inter_benchmark_sleep_seconds =
527       std::strtod(inter_benchmark_delay.c_str(), nullptr);
528   const double max_benchmark_time_seconds =
529       std::strtod(max_time.c_str(), nullptr);
530 
531   std::vector<InputLayerInfo> inputs;
532   for (int n = 0; n < inputs_count; ++n) {
533     InputLayerInfo input;
534     CHECK(DataTypeFromString(input_layer_types[n], &input.data_type))
535         << input_layer_types[n] << " was an invalid type";
536 
537     std::vector<string> split_layer_shapes =
538         str_util::Split(input_layer_shapes[n], ',');
539     for (const string& layer_shape : split_layer_shapes) {
540       int32 tmp;
541       CHECK(strings::safe_strto32(layer_shape, &tmp))
542           << "Incorrect size string specified: " << input_layer_shapes[n];
543       if (tmp == -1) {
544         LOG(ERROR) << "Any unknown sizes in the shapes (-1's) must be replaced"
545                    << " with the size you want to benchmark with.";
546         return -1;
547       } else {
548         input.shape.AddDim(tmp);
549       }
550     }
551     input.name = input_layers[n];
552     if (n < input_layer_values.size()) {
553       std::vector<string> string_tokens =
554           str_util::Split(input_layer_values[n], ',');
555       input.initialization_values.clear();
556       input.initialization_values.reserve(string_tokens.size());
557       for (const string& str_val : string_tokens) {
558         float val;
559         CHECK(strings::safe_strtof(str_val, &val))
560             << "Incorrect initialization values string specified: "
561             << input_layer_values[n];
562         input.initialization_values.push_back(val);
563       }
564     }
565     inputs.push_back(input);
566   }
567 
568   // If requested, run through the graph first to preinitialize everything
569   // before the benchmarking runs.
570   int64 warmup_time_us = 0;
571   int64 num_warmup_runs = 0;
572   if (warmup_runs > 0) {
573     Status warmup_time_status =
574         TimeMultipleRuns(inter_inference_sleep_seconds, warmup_runs, -1.0,
575                          inputs, output_layers, target_layers, session.get(),
576                          nullptr, &warmup_time_us, &num_warmup_runs);
577     if (!warmup_time_status.ok()) {
578       LOG(ERROR) << "Timing failed with " << warmup_time_status;
579       return -1;
580     }
581   }
582 
583   // Capture overall inference time without stat logging overhead. This is the
584   // timing data that can be compared to other libraries.
585   SleepSeconds(inter_benchmark_sleep_seconds);
586   int64 no_stat_time_us = 0;
587   int64 no_stat_num_runs = 0;
588   Status no_stat_time_status = TimeMultipleRuns(
589       inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
590       inputs, output_layers, target_layers, session.get(), nullptr,
591       &no_stat_time_us, &no_stat_num_runs);
592   const double no_stat_wall_time = no_stat_time_us / 1000000.0;
593   if (!no_stat_time_status.ok()) {
594     LOG(ERROR) << "Timing failed with " << no_stat_time_status;
595     return -1;
596   }
597 
598   // Run again to gather detailed log stats to get a better idea of where
599   // relative time is going within the graph.
600   SleepSeconds(inter_benchmark_sleep_seconds);
601   int64 stat_time_us = 0;
602   int64 stat_num_runs = 0;
603   Status stat_time_status = TimeMultipleRuns(
604       inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
605       inputs, output_layers, target_layers, session.get(), stats.get(),
606       &stat_time_us, &stat_num_runs);
607   if (!stat_time_status.ok()) {
608     LOG(ERROR) << "Timing failed with " << stat_time_status;
609     return -1;
610   }
611 
612   LOG(INFO) << "Average inference timings in us: "
613             << "Warmup: "
614             << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
615             << "no stats: " << no_stat_time_us / no_stat_num_runs << ", "
616             << "with stats: " << stat_time_us / stat_num_runs;
617 
618   stats->PrintStepStats();
619 
620   if (show_sizes) {
621     stats->PrintOutputs();
622   }
623 
624   if (show_flops) {
625     int64 total_flops;
626     std::unordered_map<string, int64> flops_by_op;
627     Status flop_status = CalculateFlops(*graph_def, inputs, session.get(),
628                                         &total_flops, &flops_by_op);
629     if (!flop_status.ok()) {
630       LOG(ERROR) << "FLOPs calculation failed with " << flop_status;
631       return -1;
632     }
633     string pretty_flops;
634     if (total_flops < 1000) {
635       pretty_flops = strings::StrCat(total_flops, " FLOPs");
636     } else if (total_flops < (1000 * 1000)) {
637       const float rounded_flops = (total_flops / 1000.0f);
638       pretty_flops = strings::StrCat(rounded_flops, "k FLOPs");
639     } else if (total_flops < (1000 * 1000 * 1000)) {
640       const float rounded_flops = round(total_flops / 1000.0f) / 1000.0f;
641       pretty_flops = strings::StrCat(rounded_flops, " million FLOPs");
642     } else {
643       const float rounded_flops =
644           round(total_flops / (1000.0f * 1000.0f)) / 1000.0f;
645       pretty_flops = strings::StrCat(rounded_flops, " billion FLOPs");
646     }
647     LOG(INFO) << "FLOPs estimate: " << strings::HumanReadableNum(total_flops);
648     const double mean_run_time = no_stat_wall_time / no_stat_num_runs;
649     LOG(INFO) << "FLOPs/second: "
650               << strings::HumanReadableNum(
651                      static_cast<int64>(total_flops / mean_run_time));
652   }
653 
654   if (!benchmark_name.empty() && !output_prefix.empty()) {
655     // Compute the total number of values per input.
656     int64 total_size = inputs[0].shape.num_elements();
657 
658     // Throughput in MB/s
659     const double throughput =
660         DataTypeSize(inputs[0].data_type) * total_size * no_stat_num_runs /
661         static_cast<double>(no_stat_wall_time) / (1024 * 1024);
662 
663     // Report the stats.
664     RecordBenchmarkEntry(output_prefix, benchmark_name, "", no_stat_num_runs,
665                          no_stat_wall_time, throughput);
666 
667     // Session initialization time.
668     RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-init", 1,
669                          initialization_time_s);
670 
671     // First inference time. Note: if warmup_runs is > 1 this will actually be
672     // an average of all the warmup runs.
673     RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-first-inference",
674                          warmup_runs, warmup_time_us / 1000000.0);
675 
676     // Time from starting to initialize TF to getting the first result back.
677     // This also assumes that only one warmup run is performed.
678     RecordBenchmarkEntry(
679         output_prefix, benchmark_name, "meta-init-plus-first-inference", 1,
680         initialization_time_s + (warmup_time_us / 1000000.0) / warmup_runs);
681 
682     std::map<std::string, int64_t> node_type_map_count;
683     std::map<std::string, int64_t> node_type_map_time;
684     std::map<std::string, int64_t> node_type_map_memory;
685     std::map<std::string, int64_t> node_type_map_times_called;
686 
687     int64_t accumulated_us;
688     stats->ComputeStatsByType(&node_type_map_count, &node_type_map_time,
689                               &node_type_map_memory,
690                               &node_type_map_times_called, &accumulated_us);
691     for (const auto& time : node_type_map_time) {
692       LOG(INFO) << "Outputting: [" << time.first << "]";
693       RecordBenchmarkEntry(output_prefix, benchmark_name, time.first,
694                            stat_num_runs,
695                            (time.second * stat_num_runs) / 1000000.0f);
696     }
697   }
698 
699   return 0;
700 }
701 
702 }  // namespace benchmark_model
703 }  // namespace tensorflow
704