/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// A C++ binary to benchmark a compute graph and its individual operators,
// both on desktop machines and on Android.
//
// See README.md for usage instructions.
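//
// As a purely illustrative example (these flag values mirror the defaults
// defined in Main() below; adjust the paths and layer names for your model):
//
//   benchmark_model \
//     --graph=/data/local/tmp/tensorflow_inception_graph.pb \
//     --input_layer="input:0" \
//     --input_layer_shape="1,224,224,3" \
//     --input_layer_type="float" \
//     --output_layer="output:0" \
//     --show_flops=true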

#include "tensorflow/tools/benchmark/benchmark_model.h"

#include <cmath>
#include <cstdlib>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/platform.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/util/command_line_flags.h"
#include "tensorflow/core/util/reporter.h"
#include "tensorflow/core/util/stat_summarizer.h"

namespace tensorflow {
namespace benchmark_model {

namespace {

Status InitializeVariables(Session* session,
                           const std::vector<string>& init_ops) {
  LOG(INFO) << "Initializing graph variables";
  for (const string& init_op : init_ops) {
    TF_RETURN_IF_ERROR(session->Run({}, {}, {init_op}, nullptr));
  }
  return Status::OK();
}

template <class T>
void InitializeTensor(const std::vector<float>& initialization_values,
                      Tensor* input_tensor) {
  auto type_tensor = input_tensor->flat<T>();
  type_tensor = type_tensor.constant(0);
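  // The tensor was zero-filled above; any supplied initialization values now
  // overwrite its leading elements, leaving the remainder at zero.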
  if (!initialization_values.empty()) {
    for (int i = 0; i < initialization_values.size(); ++i) {
      type_tensor(i) = static_cast<T>(initialization_values[i]);
    }
  }
}

void CreateTensorsFromInputInfo(
    const std::vector<InputLayerInfo>& inputs,
    std::vector<std::pair<string, tensorflow::Tensor> >* input_tensors) {
  for (const InputLayerInfo& input : inputs) {
    Tensor input_tensor(input.data_type, input.shape);
    switch (input.data_type) {
      case DT_INT32: {
        InitializeTensor<int32>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_FLOAT: {
        InitializeTensor<float>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_QUINT8: {
        InitializeTensor<quint8>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_UINT8: {
        InitializeTensor<uint8>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_BOOL: {
        InitializeTensor<bool>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_STRING: {
        if (!input.initialization_values.empty()) {
          LOG(FATAL) << "Initialization values are not supported for strings";
        }
        auto type_tensor = input_tensor.flat<string>();
        type_tensor = type_tensor.constant("");
        break;
      }
      default:
        LOG(FATAL) << "Unsupported input type: "
                   << DataTypeString(input.data_type);
    }
    input_tensors->push_back({input.name, input_tensor});
  }
}

Status GetOutputShapes(const std::vector<InputLayerInfo>& inputs,
                       const std::set<string>& wanted_shapes, Session* session,
                       std::unordered_map<string, TensorShape>* node_shapes) {
  std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
  CreateTensorsFromInputInfo(inputs, &input_tensors);
  std::vector<tensorflow::Tensor> output_tensors;
  std::vector<string> output_tensor_names;
  for (const string& wanted_shape : wanted_shapes) {
    bool is_input = false;
    for (const std::pair<string, tensorflow::Tensor>& input_tensor :
         input_tensors) {
      if (input_tensor.first == wanted_shape) {
        (*node_shapes)[wanted_shape] = input_tensor.second.shape();
        is_input = true;
        break;
      }
    }
    if (!is_input) {
      output_tensor_names.push_back(wanted_shape);
    }
  }
  TF_RETURN_IF_ERROR(
      session->Run(input_tensors, output_tensor_names, {}, &output_tensors));
  CHECK_EQ(output_tensors.size(), output_tensor_names.size());
  for (int i = 0; i < output_tensor_names.size(); ++i) {
    const string& wanted_shape_name = output_tensor_names[i];
    const TensorShape& found_shape = output_tensors[i].shape();
    (*node_shapes)[wanted_shape_name] = found_shape;
  }
  return Status::OK();
}

Status CalculateFlops(const GraphDef& graph,
                      const std::vector<InputLayerInfo>& inputs,
                      Session* session, int64* total_flops,
                      std::unordered_map<string, int64>* flops_by_op) {
  std::unordered_set<string> floppable_ops = {
      "Conv2D", "MatMul", "QuantizedConv2D", "QuantizedMatMul",
      "DepthwiseConv2dNative"};

  std::set<string> wanted_shapes;
  for (const NodeDef& node : graph.node()) {
    if (floppable_ops.count(node.op())) {
      for (const string& input : node.input()) {
        wanted_shapes.insert(input);
      }
      wanted_shapes.insert(node.name());
    }
  }
  std::unordered_map<string, TensorShape> found_shapes;
  TF_RETURN_IF_ERROR(
      GetOutputShapes(inputs, wanted_shapes, session, &found_shapes));

  *total_flops = 0;
  for (const NodeDef& node : graph.node()) {
    if (floppable_ops.count(node.op())) {
      int64 current_flops = 0;
      // This is a very crude approximation to FLOPs that only looks at a few
      // op types that commonly form the bulk of the computation for many
      // models. It's included here because getting even an approximate value
      // for FLOPs is still very useful for estimating utilization, versus a
      // device's theoretical maximum FLOPs/second.
      if ((node.op() == "Conv2D") || (node.op() == "QuantizedConv2D")) {
        const TensorShape& filter_shape = found_shapes[node.input(1)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64 filter_height = filter_shape.dim_size(0);
        int64 filter_width = filter_shape.dim_size(1);
        int64 filter_in_depth = filter_shape.dim_size(2);
        int64 output_count = output_shape.num_elements();
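        // As an illustrative (hypothetical) example: a 3x3 filter over 32
        // input channels producing a 1x112x112x64 output gives
        // 802816 * 32 * 3 * 3 * 2 = ~462M FLOPs, counting each
        // multiply-accumulate as two FLOPs.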
        current_flops =
            output_count * filter_in_depth * filter_height * filter_width * 2;
      } else if ((node.op() == "MatMul") || (node.op() == "QuantizedMatMul")) {
        const bool transpose_a = node.attr().at("transpose_a").b();
        const TensorShape& a_shape = found_shapes[node.input(0)];
        const TensorShape& output_shape = found_shapes[node.name()];
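        // Each output element of a matmul is a dot product of length k (the
        // shared inner dimension), costing k multiplies and k adds: 2*k FLOPs.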
        int64 k;
        if (transpose_a) {
          k = a_shape.dim_size(0);
        } else {
          k = a_shape.dim_size(1);
        }
        int64 output_count = output_shape.num_elements();
        current_flops = k * output_count * 2;
      } else if (node.op() == "DepthwiseConv2dNative") {
        const TensorShape& filter_shape = found_shapes[node.input(1)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64 filter_height = filter_shape.dim_size(0);
        int64 filter_width = filter_shape.dim_size(1);
        int64 output_count = output_shape.num_elements();
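        // Depthwise convolution applies a single filter per input channel,
        // so unlike Conv2D there is no filter_in_depth factor here.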
        current_flops = output_count * filter_height * filter_width * 2;
      }
      (*flops_by_op)[node.op()] += current_flops;
      *total_flops += current_flops;
    }
  }
  return Status::OK();
}

void RecordBenchmarkEntry(const string& output_prefix,
                          const string& benchmark_name, const string& postfix,
                          int num_runs, double total_time_s,
                          double throughput = -1.0) {
  std::stringstream stream;
  stream << benchmark_name;
  if (!postfix.empty()) {
    stream << "_" << postfix;
  }

  TestReporter node_reporter(output_prefix, stream.str());
  TF_QCHECK_OK(node_reporter.Initialize());
  TF_QCHECK_OK(
      node_reporter.Benchmark(num_runs, -1.0, total_time_s, throughput));
  TF_QCHECK_OK(node_reporter.Close());
}

void SleepSeconds(double sleep_seconds) {
  if (sleep_seconds <= 0.0) {
    return;
  }
#ifdef PLATFORM_WINDOWS
  Sleep(sleep_seconds * 1000);
#else
  // Convert the fractional sleep_seconds value into a timespec for nanosleep.
  timespec req;
  req.tv_sec = static_cast<time_t>(sleep_seconds);
  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
  nanosleep(&req, nullptr);
#endif
}

}  // namespace

Status InitializeSession(int num_threads, const string& graph,
                         std::unique_ptr<Session>* session,
                         std::unique_ptr<GraphDef>* graph_def) {
  LOG(INFO) << "Loading TensorFlow.";

  tensorflow::SessionOptions options;
  tensorflow::ConfigProto& config = options.config;
  if (num_threads > 0) {
    config.set_intra_op_parallelism_threads(num_threads);
    config.set_inter_op_parallelism_threads(num_threads);
  }
  LOG(INFO) << "Got config, " << config.device_count_size() << " devices";

  session->reset(tensorflow::NewSession(options));
  graph_def->reset(new GraphDef());
  tensorflow::GraphDef tensorflow_graph;
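  // Try to read the graph as a binary proto first, then fall back to the
  // text format if that fails.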
  Status s = ReadBinaryProto(Env::Default(), graph, graph_def->get());
  if (!s.ok()) {
    s = ReadTextProto(Env::Default(), graph, graph_def->get());
  }

  if (!s.ok()) {
    LOG(ERROR) << "Could not create TensorFlow Graph: " << s;
    return s;
  }

  s = (*session)->Create(*(graph_def->get()));
  if (!s.ok()) {
    LOG(ERROR) << "Could not create TensorFlow Session: " << s;
    return s;
  }

  return Status::OK();
}

Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
                    const std::vector<string>& outputs,
                    const std::vector<string>& targets, Session* session,
                    StatSummarizer* stats, int64* inference_time_us) {
  std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
  CreateTensorsFromInputInfo(inputs, &input_tensors);

  std::vector<tensorflow::Tensor> output_tensors;

  tensorflow::Status s;

  RunOptions run_options;
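  // When detailed stats are requested, enable full tracing so that per-node
  // timings are recorded in RunMetadata for the StatSummarizer below.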
  if (stats != nullptr) {
    run_options.set_trace_level(RunOptions::FULL_TRACE);
  }

  RunMetadata run_metadata;
  const int64 start_time = Env::Default()->NowMicros();
  s = session->Run(run_options, input_tensors, outputs, targets,
                   &output_tensors, &run_metadata);
  const int64 end_time = Env::Default()->NowMicros();
  *inference_time_us = end_time - start_time;

  if (!s.ok()) {
    LOG(ERROR) << "Error during inference: " << s;
    return s;
  }

  if (stats != nullptr) {
    assert(run_metadata.has_step_stats());
    const StepStats& step_stats = run_metadata.step_stats();
    stats->ProcessStepStats(step_stats);
  }

  return s;
}

Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
                        const std::vector<InputLayerInfo>& inputs,
                        const std::vector<string>& outputs,
                        const std::vector<string>& targets, Session* session,
                        StatSummarizer* stats, int64* total_time_us,
                        int64* actual_num_runs) {
  *total_time_us = 0;

  LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
            << max_time_s << " seconds "
            << (stats != nullptr ? "with" : "without")
            << " detailed stat logging, with " << sleep_seconds
            << "s sleep between inferences";

  Stat<int64> stat;
  const bool until_max_time = num_runs <= 0;
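  // A non-positive num_runs means there is no iteration cap; the loop then
  // runs until the max_time_s budget below is exhausted.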
  for (int i = 0; until_max_time || i < num_runs; ++i) {
    int64 time;
    Status run_status =
        RunBenchmark(inputs, outputs, targets, session, stats, &time);
    stat.UpdateStat(time);
    (*total_time_us) += time;
    ++(*actual_num_runs);

    if (max_time_s > 0.0 && (*total_time_us / 1000000.0) > max_time_s) {
      break;
    }

    if (!run_status.ok()) {
      LOG(INFO) << "Failed on run " << i;
      return run_status;
    }

    // If requested, sleep between runs for an arbitrary amount of time.
    // This can be helpful to determine the effect of mobile processor
    // scaling and thermal throttling.
    if (sleep_seconds > 0.0) {
      SleepSeconds(sleep_seconds);
    }
  }
  std::stringstream stream;
  stat.OutputToStream(&stream);
  LOG(INFO) << stream.str() << std::endl;

  return Status::OK();
}

int Main(int argc, char** argv) {
  string graph = "/data/local/tmp/tensorflow_inception_graph.pb";
  string init_ops_string = "";
  string input_layer_string = "input:0";
  string input_layer_shape_string = "1,224,224,3";
  string input_layer_type_string = "float";
  string input_layer_values_string = "";
  string output_layer_string = "output:0";
  string target_layer_string = "";
  int max_num_runs = 1000;
  string max_time = "10.0";
  string inference_delay = "-1.0";
  string inter_benchmark_delay = "-1.0";
  int num_threads = -1;
  string benchmark_name = "";
  string output_prefix = "";
  bool show_sizes = false;
  bool show_run_order = true;
  int run_order_limit = 0;
  bool show_time = true;
  int time_limit = 10;
  bool show_memory = true;
  int memory_limit = 10;
  bool show_type = true;
  bool show_summary = true;
  bool show_flops = false;
  int warmup_runs = 1;

  std::vector<Flag> flag_list = {
      Flag("graph", &graph, "graph file name"),
      Flag("init_ops", &init_ops_string, "init ops"),
      Flag("input_layer", &input_layer_string, "input layer names"),
      Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
      Flag("input_layer_type", &input_layer_type_string, "input layer type"),
      Flag("input_layer_values", &input_layer_values_string,
           "values to initialize the inputs with"),
      Flag("output_layer", &output_layer_string, "output layer name"),
      Flag("target_layer", &target_layer_string, "target layer name"),
      Flag("max_num_runs", &max_num_runs, "number of runs max"),
      Flag("max_time", &max_time, "length to run max"),
      Flag("inference_delay", &inference_delay,
           "delay between runs in seconds"),
      Flag("inter_benchmark_delay", &inter_benchmark_delay,
           "delay between benchmarks in seconds"),
      Flag("num_threads", &num_threads, "number of threads"),
      Flag("benchmark_name", &benchmark_name, "benchmark name"),
      Flag("output_prefix", &output_prefix, "benchmark output prefix"),
      Flag("show_sizes", &show_sizes, "whether to show sizes"),
      Flag("show_run_order", &show_run_order,
           "whether to list stats by run order"),
      Flag("run_order_limit", &run_order_limit,
           "how many items to show by run order"),
      Flag("show_time", &show_time, "whether to list stats by time taken"),
      Flag("time_limit", &time_limit, "how many items to show by time taken"),
      Flag("show_memory", &show_memory, "whether to list stats by memory used"),
      Flag("memory_limit", &memory_limit,
           "how many items to show by memory used"),
      Flag("show_type", &show_type, "whether to list stats by op type"),
      Flag("show_summary", &show_summary,
           "whether to show a summary of the stats"),
      Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"),
      Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"),
  };
  string usage = Flags::Usage(argv[0], flag_list);
  const bool parse_result = Flags::Parse(&argc, argv, flag_list);

  if (!parse_result) {
    LOG(ERROR) << usage;
    return -1;
  }

  std::vector<string> init_ops = str_util::Split(init_ops_string, ',');
  std::vector<string> input_layers = str_util::Split(input_layer_string, ',');
  std::vector<string> input_layer_shapes =
      str_util::Split(input_layer_shape_string, ':');
  std::vector<string> input_layer_types =
      str_util::Split(input_layer_type_string, ',');
  std::vector<string> input_layer_values =
      str_util::Split(input_layer_values_string, ':');
  std::vector<string> output_layers = str_util::Split(output_layer_string, ',');
  std::vector<string> target_layers = str_util::Split(target_layer_string, ',');
  if ((input_layers.size() != input_layer_shapes.size()) ||
      (input_layers.size() != input_layer_types.size())) {
    LOG(ERROR) << "There must be the same number of items in --input_layer,"
               << " --input_layer_shape, and --input_layer_type, for example"
               << " --input_layer=input1,input2 --input_layer_type=float,float "
               << " --input_layer_shape=1,224,224,4:1,20";
    LOG(ERROR) << "--input_layer=" << input_layer_string << " ("
               << input_layers.size() << " items)";
    LOG(ERROR) << "--input_layer_type=" << input_layer_type_string << " ("
               << input_layer_types.size() << " items)";
    LOG(ERROR) << "--input_layer_shape=" << input_layer_shape_string << " ("
               << input_layer_shapes.size() << " items)";
    return -1;
  }
  const size_t inputs_count = input_layers.size();

  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
  if (argc > 1) {
    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
    return -1;
  }

  LOG(INFO) << "Graph: [" << graph << "]";
  LOG(INFO) << "Init ops:" << init_ops_string;
  LOG(INFO) << "Input layers: [" << input_layer_string << "]";
  LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
  LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
  LOG(INFO) << "Output layers: [" << output_layer_string << "]";
  LOG(INFO) << "Target layers: [" << target_layer_string << "]";
  LOG(INFO) << "Num runs: [" << max_num_runs << "]";
  LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]";
  LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay
            << "]";
  LOG(INFO) << "Num threads: [" << num_threads << "]";
  LOG(INFO) << "Benchmark name: [" << benchmark_name << "]";
  LOG(INFO) << "Output prefix: [" << output_prefix << "]";
  LOG(INFO) << "Show sizes: [" << show_sizes << "]";
  LOG(INFO) << "Warmup runs: [" << warmup_runs << "]";

  std::unique_ptr<Session> session;
  std::unique_ptr<StatSummarizer> stats;
  std::unique_ptr<GraphDef> graph_def;

  int64 initialization_start_us = Env::Default()->NowMicros();
  Status initialize_status =
      InitializeSession(num_threads, graph, &session, &graph_def);
  int64 initialization_end_us = Env::Default()->NowMicros();
  double initialization_time_s =
      (initialization_end_us - initialization_start_us) / 1000000.0;
  LOG(INFO) << "Initialized session in " << initialization_time_s << "s";
  if (!initialize_status.ok()) {
    return -1;
  }

  if (!init_ops.empty()) {
    Status initialize_variables_status =
        InitializeVariables(session.get(), init_ops);
    if (!initialize_variables_status.ok()) {
      LOG(ERROR) << "Graph variables initialization failed with "
                 << initialize_variables_status;
      return -1;
    }
  }

  StatSummarizerOptions stats_options;
  stats_options.show_run_order = show_run_order;
  stats_options.run_order_limit = run_order_limit;
  stats_options.show_time = show_time;
  stats_options.time_limit = time_limit;
  stats_options.show_memory = show_memory;
  stats_options.memory_limit = memory_limit;
  stats_options.show_type = show_type;
  stats_options.show_summary = show_summary;
  stats.reset(new tensorflow::StatSummarizer(stats_options));

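  // Note: std::strtod returns 0.0 when the flag value cannot be parsed, and
  // any zero or negative delay simply disables the corresponding sleep.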
  const double inter_inference_sleep_seconds =
      std::strtod(inference_delay.c_str(), nullptr);
  const double inter_benchmark_sleep_seconds =
      std::strtod(inter_benchmark_delay.c_str(), nullptr);
  const double max_benchmark_time_seconds =
      std::strtod(max_time.c_str(), nullptr);

  std::vector<InputLayerInfo> inputs;
  for (int n = 0; n < inputs_count; ++n) {
    InputLayerInfo input;
    CHECK(DataTypeFromString(input_layer_types[n], &input.data_type))
        << input_layer_types[n] << " was an invalid type";
    std::vector<int32> sizes;
    CHECK(str_util::SplitAndParseAsInts(input_layer_shapes[n], ',', &sizes))
        << "Incorrect size string specified: " << input_layer_shapes[n];
    for (int i = 0; i < sizes.size(); ++i) {
      int32 size = sizes[i];
      if (size == -1) {
        LOG(ERROR) << "Any unknown sizes in the shapes (-1's) must be replaced"
                   << " with the size you want to benchmark with.";
        return -1;
      }
      input.shape.AddDim(sizes[i]);
    }
    input.name = input_layers[n];
    if (n < input_layer_values.size()) {
      CHECK(str_util::SplitAndParseAsFloats(input_layer_values[n], ',',
                                            &input.initialization_values))
          << "Incorrect initialization values string specified: "
          << input_layer_values[n];
    }
    inputs.push_back(input);
  }

  // If requested, run through the graph first to preinitialize everything
  // before the benchmarking runs.
  int64 warmup_time_us = 0;
  int64 num_warmup_runs = 0;
  if (warmup_runs > 0) {
    Status warmup_time_status =
        TimeMultipleRuns(inter_inference_sleep_seconds, warmup_runs, -1.0,
                         inputs, output_layers, target_layers, session.get(),
                         nullptr, &warmup_time_us, &num_warmup_runs);
    if (!warmup_time_status.ok()) {
      LOG(ERROR) << "Timing failed with " << warmup_time_status;
      return -1;
    }
  }

  // Capture overall inference time without stat logging overhead. This is the
  // timing data that can be compared to other libraries.
  SleepSeconds(inter_benchmark_sleep_seconds);
  int64 no_stat_time_us = 0;
  int64 no_stat_num_runs = 0;
  Status no_stat_time_status = TimeMultipleRuns(
      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
      inputs, output_layers, target_layers, session.get(), nullptr,
      &no_stat_time_us, &no_stat_num_runs);
  const double no_stat_wall_time = no_stat_time_us / 1000000.0;
  if (!no_stat_time_status.ok()) {
    LOG(ERROR) << "Timing failed with " << no_stat_time_status;
    return -1;
  }

  // Run again to gather detailed log stats to get a better idea of where
  // relative time is going within the graph.
  SleepSeconds(inter_benchmark_sleep_seconds);
  int64 stat_time_us = 0;
  int64 stat_num_runs = 0;
  Status stat_time_status = TimeMultipleRuns(
      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
      inputs, output_layers, target_layers, session.get(), stats.get(),
      &stat_time_us, &stat_num_runs);
  if (!stat_time_status.ok()) {
    LOG(ERROR) << "Timing failed with " << stat_time_status;
    return -1;
  }

  LOG(INFO) << "Average inference timings in us: "
            << "Warmup: "
            << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
            << "no stats: " << no_stat_time_us / no_stat_num_runs << ", "
            << "with stats: " << stat_time_us / stat_num_runs;

  stats->PrintStepStats();

  if (show_sizes) {
    stats->PrintOutputs();
  }

  if (show_flops) {
    int64 total_flops;
    std::unordered_map<string, int64> flops_by_op;
    Status flop_status = CalculateFlops(*graph_def, inputs, session.get(),
                                        &total_flops, &flops_by_op);
    if (!flop_status.ok()) {
      LOG(ERROR) << "FLOPs calculation failed with " << flop_status;
      return -1;
    }
    string pretty_flops;
    if (total_flops < 1000) {
      pretty_flops = strings::StrCat(total_flops, " FLOPs");
    } else if (total_flops < (1000 * 1000)) {
      const float rounded_flops = (total_flops / 1000.0f);
      pretty_flops = strings::StrCat(rounded_flops, "k FLOPs");
    } else if (total_flops < (1000 * 1000 * 1000)) {
      const float rounded_flops = round(total_flops / 1000.0f) / 1000.0f;
      pretty_flops = strings::StrCat(rounded_flops, " million FLOPs");
    } else {
      const float rounded_flops =
          round(total_flops / (1000.0f * 1000.0f)) / 1000.0f;
      pretty_flops = strings::StrCat(rounded_flops, " billion FLOPs");
    }
    LOG(INFO) << "FLOPs estimate: " << strings::HumanReadableNum(total_flops);
    const double mean_run_time = no_stat_wall_time / no_stat_num_runs;
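    // Estimated FLOPs divided by the mean wall time per run gives achieved
    // FLOPs/second, which can be compared against a device's theoretical
    // peak to estimate utilization (see the caveats in CalculateFlops()).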
    LOG(INFO) << "FLOPs/second: "
              << strings::HumanReadableNum(
                     static_cast<int64>(total_flops / mean_run_time));
  }

  if (!benchmark_name.empty() && !output_prefix.empty()) {
    // Compute the total number of values in the first input tensor.
    int64 total_size = inputs[0].shape.num_elements();

    // Throughput in MB/s.
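    // (bytes in the first input tensor, times the number of runs, divided by
    // the total wall time and scaled to 1024*1024-byte units).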
    const double throughput =
        DataTypeSize(inputs[0].data_type) * total_size * no_stat_num_runs /
        static_cast<double>(no_stat_wall_time) / (1024 * 1024);

    // Report the stats.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "", no_stat_num_runs,
                         no_stat_wall_time, throughput);

    // Session initialization time.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-init", 1,
                         initialization_time_s);

    // First inference time. Note: if warmup_runs is > 1 this will actually be
    // an average of all the warmup runs.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-first-inference",
                         warmup_runs, warmup_time_us / 1000000.0);

    // Time from starting to initialize TF to getting the first result back.
    // This also assumes that only one warmup run is performed.
    RecordBenchmarkEntry(
        output_prefix, benchmark_name, "meta-init-plus-first-inference", 1,
        initialization_time_s + (warmup_time_us / 1000000.0) / warmup_runs);

    std::map<std::string, int64_t> node_type_map_count;
    std::map<std::string, int64_t> node_type_map_time;
    std::map<std::string, int64_t> node_type_map_memory;
    std::map<std::string, int64_t> node_type_map_times_called;

    int64_t accumulated_us;
    stats->ComputeStatsByType(&node_type_map_count, &node_type_map_time,
                              &node_type_map_memory,
                              &node_type_map_times_called, &accumulated_us);
    for (const auto& time : node_type_map_time) {
      LOG(INFO) << "Outputting: [" << time.first << "]";
      RecordBenchmarkEntry(output_prefix, benchmark_name, time.first,
                           stat_num_runs,
                           (time.second * stat_num_runs) / 1000000.0f);
    }
  }

  return 0;
}

}  // namespace benchmark_model
}  // namespace tensorflow