• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"
17 
18 #include <math.h>
19 
20 #include <algorithm>
21 #include <string>
22 #include <vector>
23 
24 #include "google/protobuf/any.pb.h"
25 #include "absl/container/flat_hash_map.h"
26 #include "absl/strings/match.h"
27 #include "absl/strings/str_cat.h"
28 #include "absl/strings/string_view.h"
29 #include "tensorflow/core/lib/gtl/map_util.h"
30 #include "tensorflow/core/platform/logging.h"
31 #include "tensorflow/core/platform/types.h"
32 #include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
33 #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
34 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
35 #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
36 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
37 #include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
38 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
39 #include "tensorflow/core/profiler/utils/diagnostics.h"
40 #include "tensorflow/core/profiler/utils/event_span.h"
41 #include "tensorflow/core/profiler/utils/format_utils.h"
42 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
43 #include "tensorflow/core/profiler/utils/html_utils.h"
44 #include "tensorflow/core/profiler/utils/math_utils.h"
45 #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
46 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
47 #include "tensorflow/core/profiler/utils/time_utils.h"
48 #include "tensorflow/core/util/stats_calculator.h"
49 
50 namespace tensorflow {
51 namespace profiler {
52 
53 namespace {
54 
// Number of picoseconds in a millisecond. constexpr for consistency with the
// other file-local constants below.
constexpr double kNumPsPerMs = 1000000000.0;
56 
// If the percentage of step time that is due to infeed is less than
// kModeratelyInfeedBoundThresholdInPercent, it is considered NOT
// input-bound; else if it is less than
// kHighlyInfeedBoundThresholdInPercent, it is considered MODERATELY
// input-bound; else it is considered HIGHLY input-bound.
constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
constexpr double kHighlyInfeedBoundThresholdInPercent = 20;

// If the percentage of step time that is due to outfeed is less than
// kModeratelyOutfeedBoundThresholdInPercent, it is considered NOT
// output-bound; else if it is less than
// kHighlyOutfeedBoundThresholdInPercent, it is considered MODERATELY
// output-bound; else it is considered HIGHLY output-bound.
constexpr double kModeratelyOutfeedBoundThresholdInPercent = 5;
constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;

// If the percentage of step time that is due to kernel launch is less than
// kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
// kernel-launch bound; else if it is less than
// kHighlyKernelLaunchBoundThresholdInPercent, it is considered MODERATELY
// kernel-launch bound; else it is considered HIGHLY kernel-launch bound.
constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;

// If the percentage of step time that is due to all other time is less than
// kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
// all-other bound; else if it is less than
// kHighlyAllOtherBoundThresholdInPercent, it is considered MODERATELY
// all-other bound; else it is considered HIGHLY all-other bound.
constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;

// If the percentage of step time that is due to device collectives is less than
// kModeratelyDeviceCollectivesBoundThresholdInPercent, it is considered NOT
// device-collectives bound; else if it is less than
// kHighlyDeviceCollectivesBoundThresholdInPercent, it is considered MODERATELY
// device-collectives bound; else it is considered HIGHLY device-collectives
// bound.
constexpr double kModeratelyDeviceCollectivesBoundThresholdInPercent = 3;
constexpr double kHighlyDeviceCollectivesBoundThresholdInPercent = 15;

// Section number of the host-analysis section in the input-pipeline analysis.
constexpr int kHostAnalysisSectionNumber = 3;
// Python-only explanation for "All Others" time.
const char* kAllOthersPythonExplanation =
    " % of the total step time sampled is spent on 'All Others' time. "
    "This could be due to Python execution overhead.";
// Explanation for "Kernel Launch" time due to CPU contention with tf.data.
const char* kKernelLaunchTfDataContention =
    " It could be due to CPU contention with tf.data. In this case, you may "
    "try to set the environment variable TF_GPU_THREAD_MODE=gpu_private.";
108 
// Looks up `event_type` in `type_ps` (a map-like collection keyed by
// EventType with picosecond values) and returns the value converted to
// milliseconds. A missing entry counts as 0.
template <class Collection>
double GetTimeInMs(const Collection& type_ps, EventType event_type) {
  return PicosToMillis(gtl::FindWithDefault(type_ps, event_type, /*value=*/0));
}
113 
GetStepSummaryForSampleStats(const Stat<double> & sample_stats)114 StepSummary GetStepSummaryForSampleStats(const Stat<double>& sample_stats) {
115   StepSummary step_time_summary;
116   double avg, sdv, min, max;
117   if (sample_stats.empty()) {
118     // If sample_stats is empty, sample_stats.avg() will return NaN. However, we
119     // prefer to show an 0 instead.
120     avg = sdv = min = max = 0.0;
121   } else {
122     avg = sample_stats.avg();
123     sdv = sqrt(sample_stats.sample_variance());
124     min = sample_stats.min();
125     max = sample_stats.max();
126   }
127   step_time_summary.set_average(avg);
128   step_time_summary.set_standard_deviation(sdv);
129   step_time_summary.set_minimum(min);
130   step_time_summary.set_maximum(max);
131   return step_time_summary;
132 }
133 
// Aggregates the per-step time components stored in `analysis.step_details()`
// (each packed as a PerGenericStepDetails Any) into per-component summary
// statistics (avg/sdv/min/max, in ms). Returns a default-constructed
// breakdown if any step detail fails to unpack.
GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
    const InputPipelineAnalysisResult& analysis) {
  Stat<double> unknown_time_ms;
  Stat<double> host_wait_input_ms;
  Stat<double> host_to_device_ms;
  Stat<double> input_ms;
  Stat<double> output_ms;
  Stat<double> device_compute_ms;
  Stat<double> device_to_device_ms;
  Stat<double> device_collectives_ms;
  Stat<double> host_compute_ms;
  Stat<double> host_prepare_ms;
  Stat<double> host_compile_ms;
  GenericStepTimeBreakdown result;

  for (const google::protobuf::Any& step_details : analysis.step_details()) {
    PerGenericStepDetails details;
    bool success = step_details.UnpackTo(&details);
    // An empty type_url means an unset Any, which is tolerated; any other
    // unpack failure aborts and returns an empty breakdown.
    if (!success && !step_details.type_url().empty()) {
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    unknown_time_ms.UpdateStat(details.unknown_time_ms());
    host_wait_input_ms.UpdateStat(details.host_wait_input_ms());
    host_to_device_ms.UpdateStat(details.host_to_device_ms());
    // Total input time = time waiting for input + time copying it to device.
    input_ms.UpdateStat(details.host_wait_input_ms() +
                        details.host_to_device_ms());
    output_ms.UpdateStat(details.output_ms());
    device_compute_ms.UpdateStat(details.device_compute_ms());
    device_to_device_ms.UpdateStat(details.device_to_device_ms());
    device_collectives_ms.UpdateStat(details.device_collectives_ms());
    host_compute_ms.UpdateStat(details.host_compute_ms());
    host_prepare_ms.UpdateStat(details.host_prepare_ms());
    host_compile_ms.UpdateStat(details.host_compile_ms());
  }
  *result.mutable_unknown_time_ms_summary() =
      GetStepSummaryForSampleStats(unknown_time_ms);
  *result.mutable_host_wait_input_ms_summary() =
      GetStepSummaryForSampleStats(host_wait_input_ms);
  *result.mutable_host_to_device_ms_summary() =
      GetStepSummaryForSampleStats(host_to_device_ms);
  *result.mutable_input_ms_summary() = GetStepSummaryForSampleStats(input_ms);
  *result.mutable_output_ms_summary() = GetStepSummaryForSampleStats(output_ms);
  *result.mutable_device_compute_ms_summary() =
      GetStepSummaryForSampleStats(device_compute_ms);
  *result.mutable_device_to_device_ms_summary() =
      GetStepSummaryForSampleStats(device_to_device_ms);
  *result.mutable_device_collectives_ms_summary() =
      GetStepSummaryForSampleStats(device_collectives_ms);
  *result.mutable_host_compute_ms_summary() =
      GetStepSummaryForSampleStats(host_compute_ms);
  *result.mutable_host_prepare_ms_summary() =
      GetStepSummaryForSampleStats(host_prepare_ms);
  *result.mutable_host_compile_ms_summary() =
      GetStepSummaryForSampleStats(host_compile_ms);
  return result;
}
192 
// Builds the generic (non-TPU-specific) input-pipeline analysis from the
// per-step, per-core step database: a per-step breakdown of time components,
// the step-time summary, the input-percentage summary, and the packed
// GenericStepTimeBreakdown. Returns a default-constructed result if a step
// breakdown fails to unpack.
InputPipelineAnalysisResult ComputeGenericInputPipelineAnalysisResult(
    const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
  InputPipelineAnalysisResult result;

  // Computes the summary of step time in ms.
  *result.mutable_step_time_summary() =
      ComputeStepTimeSummaryInMs(grouped_by_step);

  Stat<double> input_summary_stats_in_percent;
  for (const auto& coreid_stepinfo_map : grouped_by_step) {
    // Iterates over each step.
    const auto* ptr = gtl::FindOrNull(coreid_stepinfo_map.step_info_per_core(),
                                      kDefaultGpuLocalCoreId);
    if (ptr == nullptr) {
      // For generic hardware, all step-info is put under core-0. If ptr
      // is nullptr, it means there is no step at all.
      continue;
    }
    const StepInfoResult& step_info = *ptr;
    // Adds the details for a new step.
    PerGenericStepDetails details;
    details.set_step_number(step_info.step_num());
    if (step_info.step_name().empty()) {
      // Falls back to the step number when no name was recorded.
      details.set_step_name(absl::StrCat(step_info.step_num()));
    } else {
      details.set_step_name(step_info.step_name());
    }
    details.set_step_time_ms(PicosToMillis(step_info.duration_ps()));
    GenericStepBreakdown generic;
    bool success = step_info.step_breakdown().UnpackTo(&generic);
    // An unset Any (empty type_url) is tolerated; any other unpack failure
    // aborts the whole analysis.
    if (!success && !step_info.step_breakdown().type_url().empty()) {
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    // Maps each EventType bucket (in ps) onto the corresponding ms field of
    // the details proto; some fields combine two event types.
    const auto& type_ps = generic.type_ps();
    details.set_unknown_time_ms(GetTimeInMs(type_ps, UNKNOWN_TIME));
    details.set_host_wait_input_ms(GetTimeInMs(type_ps, HOST_WAIT_INPUT));
    details.set_host_to_device_ms(GetTimeInMs(type_ps, HOST_TO_DEVICE) +
                                  GetTimeInMs(type_ps, DEVICE_WAIT_HOST));
    details.set_output_ms(GetTimeInMs(type_ps, DEVICE_TO_HOST));
    details.set_device_compute_ms(GetTimeInMs(type_ps, DEVICE_COMPUTE_16) +
                                  GetTimeInMs(type_ps, DEVICE_COMPUTE_32));
    details.set_device_to_device_ms(GetTimeInMs(type_ps, DEVICE_TO_DEVICE) +
                                    GetTimeInMs(type_ps, DEVICE_WAIT_DEVICE));
    details.set_device_collectives_ms(GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
    details.set_host_compute_ms(GetTimeInMs(type_ps, HOST_COMPUTE));
    details.set_host_prepare_ms(GetTimeInMs(type_ps, HOST_PREPARE));
    details.set_host_compile_ms(GetTimeInMs(type_ps, HOST_COMPILE));
    result.add_step_details()->PackFrom(details);

    const double input_percent_of_step_time =
        100.0 *
        SafeDivide(details.host_wait_input_ms() + details.host_to_device_ms(),
                   details.step_time_ms());
    input_summary_stats_in_percent.UpdateStat(input_percent_of_step_time);
  }

  // Computes the summary of input time as percentage of step time.
  *result.mutable_input_percent_summary() =
      GetStepSummaryForSampleStats(input_summary_stats_in_percent);

  // Computes the breakdown of step time.
  GenericStepTimeBreakdown generic_step_time_breakdown =
      ComputeGenericStepTimeBreakdownInMs(result);
  result.mutable_step_time_breakdown()->PackFrom(generic_step_time_breakdown);

  return result;
}
262 
// Classification of input processing on the host.
enum class InputOpCategory {
  kEnqueue,           // enqueue data to be transferred to device.
  kDemandedFileRead,  // demanded read from file.
  kAdvancedFileRead,  // advanced read from file (including cached,
                      // prefetch, parallel-map, interleave).
  kPreprocessing      // data preprocessing.
};

// Returns the human-readable display string for `category`.
std::string InputOpCategoryString(InputOpCategory category) {
  switch (category) {
    case InputOpCategory::kEnqueue:
      return "Enqueue";
    case InputOpCategory::kDemandedFileRead:
      return "Demanded file read";
    case InputOpCategory::kAdvancedFileRead:
      return "Advanced file read";
    case InputOpCategory::kPreprocessing:
      return "Preprocessing";
  }
  // Unreachable for the enumerators above, but an out-of-range value would
  // otherwise flow off the end of a value-returning function (undefined
  // behavior; -Wreturn-type).
  return "";
}
284 
// Returns true if an Op with the given category is considered part of the
// host-side input pipeline.
inline bool IsInputOp(absl::string_view category) {
  // Do not include "IteratorGetNext*" here, because IteratorGetNext is an Op
  // that experiences the input stall, not an Op that causes the input stall.
  return IsInfeedEnqueueOp(category) || IsDatasetOp(category) ||
         IsMemcpyHToDOp(category);
}
291 
292 // TODO(ckluk):
293 //   Confirm with the tf.data team if the classification below is correct.
CategorizeInputOp(absl::string_view name,absl::string_view category)294 InputOpCategory CategorizeInputOp(absl::string_view name,
295                                   absl::string_view category) {
296   if (IsInfeedEnqueueOp(category) || IsMemcpyHToDOp(category)) {
297     // Ops for sending input from host to device.
298     return InputOpCategory::kEnqueue;
299   }
300   DCHECK(IsDatasetOp(category));
301   if (absl::EndsWith(name, "::TFRecord") ||
302       absl::EndsWith(name, "::TextLine") ||
303       absl::EndsWith(name, "::FixedLengthRecord") ||
304       absl::EndsWith(name, "::SSTable") || absl::EndsWith(name, "::RecordIO")) {
305     // Ops that read files.
306     if (absl::StrContains(name, "::MemoryReader") ||
307         absl::StrContains(name, "::MemoryWriter") ||
308         absl::StrContains(name, "::Interleave") ||
309         absl::StrContains(name, "::Prefetch") ||
310         absl::StrContains(name, "::ParallelMap")) {
311       // Ops that read files in advance, including caching, interleaving, and
312       // prefetching.
313       return InputOpCategory::kAdvancedFileRead;
314     } else {
315       // Ops that read files on demand.
316       return InputOpCategory::kDemandedFileRead;
317     }
318   } else {
319     // All other ops are classified as preprocessing.
320     return InputOpCategory::kPreprocessing;
321   }
322 }
323 
// Metrics of the Ops selected as belonging to the input pipeline.
struct InputOpMetrics {
  // Non-owning pointers to the selected input Ops.
  std::vector<const OpMetrics*> input_op_metrics;
  // Sum of the self times (in picoseconds) of all selected input Ops.
  uint64 input_op_time_ps = 0;
};
328 
SelectInputOpMetrics(const OpMetricsDb & all_op_metrics)329 InputOpMetrics SelectInputOpMetrics(const OpMetricsDb& all_op_metrics) {
330   InputOpMetrics input_op_metrics;
331   for (const OpMetrics* op_metrics : SortedOpMetricsDb(all_op_metrics)) {
332     if (IsInputOp(op_metrics->category())) {
333       input_op_metrics.input_op_metrics.push_back(op_metrics);
334       input_op_metrics.input_op_time_ps += op_metrics->self_time_ps();
335     }
336   }
337   return input_op_metrics;
338 }
339 
ConvertOpMetricsToInputOpDetails(const OpMetrics & op_metrics,uint64 input_op_time_ps,InputOpCategory category)340 InputOpDetails ConvertOpMetricsToInputOpDetails(const OpMetrics& op_metrics,
341                                                 uint64 input_op_time_ps,
342                                                 InputOpCategory category) {
343   InputOpDetails details;
344   details.set_op_name(op_metrics.name());
345   details.set_count(op_metrics.occurrences());
346   details.set_time_in_ms(PicosToMillis(op_metrics.time_ps()));
347   details.set_self_time_in_ms(PicosToMillis(op_metrics.self_time_ps()));
348   details.set_time_in_percent(
349       100.0 * SafeDivide(op_metrics.time_ps(), input_op_time_ps));
350   details.set_self_time_in_percent(
351       100.0 * SafeDivide(op_metrics.self_time_ps(), input_op_time_ps));
352   details.set_category(InputOpCategoryString(category));
353   return details;
354 }
355 
// Returns the ratio of the host-to-device time in each step to the step-time.
// Prefers the host infeed-enqueue ratio (TPU with infeed) when available;
// otherwise derives the ratio from the generic step-time breakdown. Returns
// 0.0 when neither source is available.
double RatioOfHostToDeviceTimeToStepTime(
    const OpMetricsDb& host_tf_metrics_db,
    const InputPipelineAnalysisResult& input_pipeline_analysis) {
  // For TPU execution that uses infeed.
  absl::optional<double> host_infeed_enqueue_ratio =
      HostInfeedEnqueueRatio(host_tf_metrics_db);
  if (host_infeed_enqueue_ratio.has_value()) {
    return host_infeed_enqueue_ratio.value();
  }
  // For GPU and TPU execution that do not use infeed.
  double avg_step_time_ms =
      input_pipeline_analysis.step_time_summary().average();
  if (avg_step_time_ms > 0) {
    // Uses the on-device step time.
    GenericStepTimeBreakdown generic_breakdown;
    if (input_pipeline_analysis.step_time_breakdown().UnpackTo(
            &generic_breakdown)) {
      double avg_host_to_device_time_ms =
          generic_breakdown.host_to_device_ms_summary().average();
      return SafeDivide(avg_host_to_device_time_ms, avg_step_time_ms);
    }
  }
  return 0.0;
}
381 
DeviceCollectivesAnalysis(double device_collectives_percent,std::string * device_collectives_classification,std::string * device_collectives_statement)382 void DeviceCollectivesAnalysis(double device_collectives_percent,
383                                std::string* device_collectives_classification,
384                                std::string* device_collectives_statement) {
385   if (device_collectives_percent >=
386       kHighlyDeviceCollectivesBoundThresholdInPercent) {
387     *device_collectives_classification = "high";
388     *device_collectives_statement =
389         absl::StrCat(OneDigit(device_collectives_percent),
390                      " % of the total step time sampled is spent on 'Device "
391                      "Collective Communication'.");
392   } else if (device_collectives_percent >=
393              kModeratelyDeviceCollectivesBoundThresholdInPercent) {
394     *device_collectives_classification = "moderate";
395     *device_collectives_statement =
396         absl::StrCat(OneDigit(device_collectives_percent),
397                      " % of the total step time sampled is spent on 'Device "
398                      "Collective Communication'.");
399   } else {
400     *device_collectives_classification = "no";
401     *device_collectives_statement = "";
402   }
403 }
404 
KernelLaunchAnalysis(bool tfdata_used,double kernel_launch_percent,std::string * kernel_launch_classification,std::string * kernel_launch_statement)405 void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
406                           std::string* kernel_launch_classification,
407                           std::string* kernel_launch_statement) {
408   if (kernel_launch_percent >= kHighlyKernelLaunchBoundThresholdInPercent) {
409     *kernel_launch_classification = "high";
410     *kernel_launch_statement = absl::StrCat(
411         OneDigit(kernel_launch_percent),
412         " % of the total step time sampled is spent on 'Kernel Launch'.");
413     if (tfdata_used) {
414       absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
415     }
416   } else if (kernel_launch_percent >=
417              kModeratelyKernelLaunchBoundThresholdInPercent) {
418     *kernel_launch_classification = "moderate";
419     *kernel_launch_statement = absl::StrCat(
420         OneDigit(kernel_launch_percent),
421         " % of the total step time sampled is spent on 'Kernel Launch'.");
422     if (tfdata_used) {
423       absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
424     }
425   } else {
426     *kernel_launch_classification = "no";
427     *kernel_launch_statement = "";
428   }
429 }
430 
AllOtherAnalysis(bool all_other_reported,double all_other_percent,std::string * all_other_classification,std::string * all_other_statement)431 void AllOtherAnalysis(bool all_other_reported, double all_other_percent,
432                       std::string* all_other_classification,
433                       std::string* all_other_statement) {
434   if (all_other_reported) {
435     *all_other_classification = "no";
436     *all_other_statement = "";
437     return;
438   }
439   if (all_other_percent >= kHighlyAllOtherBoundThresholdInPercent) {
440     *all_other_classification = "high";
441     *all_other_statement =
442         absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
443   } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
444     *all_other_classification = "moderate";
445     *all_other_statement =
446         absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
447   } else {
448     *all_other_classification = "no";
449     *all_other_statement = "";
450   }
451 }
452 
453 // Tests if tf.data API is in use.
TfDataInUse(const InputTimeBreakdown & breakdown)454 bool TfDataInUse(const InputTimeBreakdown& breakdown) {
455   // Do not include enqueue_us because the "enqueue" Op that Xprof recognizes is
456   // not part of tf.data.
457   return breakdown.demanded_file_read_us() > 0 ||
458          breakdown.advanced_file_read_us() > 0 ||
459          breakdown.preprocessing_us() > 0;
460 }
461 
462 // Returns a HTML link with the given text.
MakeDocLink(absl::string_view doc_link,absl::string_view text)463 std::string MakeDocLink(absl::string_view doc_link, absl::string_view text) {
464   return absl::StrCat("<a href=\"", doc_link, "\" target=\"_blank\">", text,
465                       "</a>");
466 }
467 
// Returns the URL of the introduction to the tf.data API (a bare URL, not an
// HTML anchor; wrap with MakeDocLink/AnchorElement to render as a link).
std::string DatasetIntroDoc() {
  return "https://www.tensorflow.org/guide/data";
}
472 
473 }  // namespace
474 
// Generates the host-side portion of the input-pipeline analysis: per-Op
// details for every input Op found in `host_tf_metrics_db`, plus the
// input-time breakdown (enqueue / demanded read / advanced read /
// preprocessing / unclassified) written into `result`.
void GenerateHostResult(const OpMetricsDb& host_tf_metrics_db,
                        InputPipelineAnalysisResult* result) {
  InputOpMetrics input_op_metrics = SelectInputOpMetrics(host_tf_metrics_db);
  // Returns if the program is not using an input pipeline with
  // instrumentation and hence no input ops are found.
  if (input_op_metrics.input_op_metrics.empty()) return;

  // Aggregates the self time (us) of the input Ops per category.
  absl::flat_hash_map<InputOpCategory, double> aggregated_input_op_times_us;
  for (const OpMetrics* op_metrics : input_op_metrics.input_op_metrics) {
    InputOpCategory category =
        CategorizeInputOp(op_metrics->name(), op_metrics->category());
    *result->add_input_op_details() = ConvertOpMetricsToInputOpDetails(
        *op_metrics, input_op_metrics.input_op_time_ps, category);
    aggregated_input_op_times_us[category] +=
        PicosToMicros(op_metrics->self_time_ps());
  }

  double enqueue_time_us =
      aggregated_input_op_times_us[InputOpCategory::kEnqueue];
  double total_input_op_time_us =
      aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kPreprocessing];

  // Fraction of step time spent on host-to-device transfer, capped at 1.
  double ratio = std::min(
      1.0, RatioOfHostToDeviceTimeToStepTime(host_tf_metrics_db, *result));
  DCHECK_GE(ratio, 0.0);
  // When ratio > 0, infers the non-enqueue input time from the enqueue time
  // by assuming enqueue accounts for `ratio` of the step time; otherwise
  // falls back to the measured total of the non-enqueue categories.
  // NOTE(review): this inference assumes enqueue time tracks the
  // host-to-device ratio — confirm against the TPU infeed model.
  double non_enqueue_time_us = (ratio != 0.0)
                                   ? (enqueue_time_us * (1.0 - ratio) / ratio)
                                   : total_input_op_time_us;

  // Scales the various input-time components wrt to non_enqueue_time_us.
  double scaled_demanded_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead],
      total_input_op_time_us);
  double scaled_advanced_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead],
      total_input_op_time_us);
  double scaled_preprocessing_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kPreprocessing],
      total_input_op_time_us);
  // Whatever non-enqueue time the scaled components do not account for;
  // clamped at zero so rounding cannot produce a negative value.
  double unclassified_non_enqueue_time_us = std::max(
      0.0, non_enqueue_time_us - scaled_demanded_fileread_time_us -
               scaled_advanced_fileread_time_us - scaled_preprocessing_time_us);

  InputTimeBreakdown* input_time_breakdown =
      result->mutable_input_time_breakdown();
  input_time_breakdown->set_enqueue_us(enqueue_time_us);
  input_time_breakdown->set_demanded_file_read_us(
      scaled_demanded_fileread_time_us);
  input_time_breakdown->set_advanced_file_read_us(
      scaled_advanced_fileread_time_us);
  input_time_breakdown->set_preprocessing_us(scaled_preprocessing_time_us);
  input_time_breakdown->set_unclassified_non_enqueue_us(
      unclassified_non_enqueue_time_us);
}
534 
// Builds the static list of tf.data tuning recommendations (one detail string
// per input-time category), each with embedded documentation links.
InputPipelineAnalysisRecommendation GenerateRecommendation() {
  // NOTE(review): this URL uses the legacy programmers_guide path; the
  // current guide lives at https://www.tensorflow.org/guide/data (see
  // DatasetIntroDoc) — consider updating.
  const absl::string_view kDatasetIntro =
      "https://www.tensorflow.org/programmers_guide/datasets";

  const absl::string_view kDatasetTopic =
      "https://www.tensorflow.org/api_docs/python/tf/data/Dataset#";

  const absl::string_view kTfRecordDataset =
      "https://www.tensorflow.org/api_docs/python/tf/data/"
      "TFRecordDataset#class_tfrecorddataset";

  InputPipelineAnalysisRecommendation recommendation;
  *recommendation.add_details() =
      "Enqueuing data: you may want to combine small input data chunks "
      "into fewer "
      "but larger chunks.";
  *recommendation.add_details() = absl::StrCat(
      "Data preprocessing: you may increase num_parallel_calls in ",
      AnchorElement(absl::StrCat(kDatasetTopic, "map"), "Dataset map()"),
      " or preprocess the data OFFLINE.");
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files in advance: you may tune parameters in the "
      "following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch size"),
      ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"),
                    "interleave cycle_length"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer_size"), ")");
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files on demand: you should read data IN ADVANCE "
      "using the following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch"), ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"), "interleave"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer"), ")");
  *recommendation.add_details() = absl::StrCat(
      "Other data reading or processing: you may consider using the ",
      AnchorElement(kDatasetIntro, "tf.data API"),
      " (if you are not using it now)");

  return recommendation;
}
576 
ComputeStepTimeSummaryInMs(const protobuf::RepeatedPtrField<PerCoreStepInfo> & grouped_by_step)577 StepSummary ComputeStepTimeSummaryInMs(
578     const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
579   Stat<double> total_step_stats_in_ms;
580   // iterates over each step.
581   for (const auto& coreid_stepinfo_map : grouped_by_step) {
582     double max_per_step_stats_in_ms = 0.0;
583     // iterates over each core.
584     for (const auto& coreid_and_stepinfo :
585          coreid_stepinfo_map.step_info_per_core()) {
586       const auto& step_info = coreid_and_stepinfo.second;
587       max_per_step_stats_in_ms = std::max(step_info.duration_ps() / kNumPsPerMs,
588                                           max_per_step_stats_in_ms);
589     }
590     // Step time of each step is determined by the slowest core.
591     total_step_stats_in_ms.UpdateStat(max_per_step_stats_in_ms);
592   }
593 
594   return GetStepSummaryForSampleStats(total_step_stats_in_ms);
595 }
596 
// Top-level conversion from OpStats to the input-pipeline analysis: computes
// the generic per-step analysis, adds diagnostics, hardware type, the
// host-side analysis, the bottleneck analysis, and the recommendations.
InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis(
    const OpStats& op_stats) {
  InputPipelineAnalysisResult result =
      ComputeGenericInputPipelineAnalysisResult(
          op_stats.step_db().step_sequence());
  PopulateStepDiagnostics(op_stats, result.mutable_diagnostics());
  result.set_hardware_type(HardwareType_Name(
      ParseHardwareType(op_stats.run_environment().device_type())));
  GenerateHostResult(op_stats.host_op_metrics_db(), &result);

  InputPipelineAnalysisRecommendation recommendation = GenerateRecommendation();
  BottleneckAnalysis bottleneck_analysis = ComputeBottleneckAnalysis(
      result.input_time_breakdown(), result.step_details());
  result.set_input_percent(bottleneck_analysis.input_percent());
  result.set_output_percent(bottleneck_analysis.output_percent());
  result.set_idle_percent(bottleneck_analysis.idle_percent());
  result.set_compute_percent(bottleneck_analysis.compute_percent());

  // The bottleneck analysis rides along inside the recommendation proto.
  recommendation.mutable_bottleneck_analysis()->PackFrom(bottleneck_analysis);
  *recommendation.mutable_summary_next_step() =
      GetSummaryNextStep(bottleneck_analysis.input_classification(),
                         result.input_time_breakdown());

  *result.mutable_recommendation() = recommendation;
  return result;
}
623 
// Classifies how input-bound the program is ("host"/"both"/"device") and
// fills in the user-facing statement. Returns true iff the conclusion is
// tentative, i.e., the program is only POTENTIALLY input-bound based on
// significant 'All Others' time; returns false for the definite
// classifications.
bool InputAnalysis(double input_percent, double all_other_percent,
                   std::string* input_classification,
                   std::string* input_statement) {
  absl::string_view non_input_time = "other time";
  if (input_percent >= kHighlyInfeedBoundThresholdInPercent) {
    *input_classification = "host";
    *input_statement = absl::StrCat(
        "Your program is HIGHLY input-bound because ", OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, you "
        "should first focus on reducing the input time.");
    return false;
  } else if (input_percent >= kModeratelyInfeedBoundThresholdInPercent) {
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is MODERATELY input-bound because ",
        OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, "
        "you would need to reduce both the input time and ",
        non_input_time, ".");
    return false;
  } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
    // Input analysis says it is not input-bound, but "All-Other" time
    // is significant. It could still be input-bound (or Python overhead).
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is POTENTIALLY input-bound because ",
        OneDigit(all_other_percent),
        "% of the total step time sampled is spent on 'All Others' time (which "
        "could be due to I/O or Python execution or both).");
    return true;
  } else {
    // Definitely not input-bound.
    *input_classification = "device";
    *input_statement =
        absl::StrCat("Your program is NOT input-bound because only ",
                     OneDigit(input_percent),
                     "% of the total step time sampled is waiting for "
                     "input. Therefore, you should focus on "
                     "reducing ",
                     non_input_time, ".");
    return false;
  }
}
667 
OutputAnalysis(double output_percent,std::string * output_classification,std::string * output_statement)668 void OutputAnalysis(double output_percent, std::string* output_classification,
669                     std::string* output_statement) {
670   if (output_percent >= kHighlyOutfeedBoundThresholdInPercent) {
671     *output_classification = "host";
672     *output_statement = absl::StrCat(
673         "Your program is HIGHLY output-bound because ",
674         OneDigit(output_percent),
675         "% of the total step time sampled is spent on output. Therefore, you "
676         "should first focus on reducing the output time.");
677   } else if (output_percent >= kModeratelyOutfeedBoundThresholdInPercent) {
678     *output_classification = "both";
679     *output_statement = absl::StrCat(
680         "Your program is MODERATELY output-bound because ",
681         OneDigit(output_percent),
682         "% of the total step time sampled is spent on output. Therefore, "
683         "you would need to reduce both the output time and other time.");
684   } else {
685     *output_classification = "device";
686     *output_statement = "";
687   }
688 }
689 
ComputeBottleneckAnalysis(const InputTimeBreakdown & input_time_breakdown,const::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any> & any_step_details)690 BottleneckAnalysis ComputeBottleneckAnalysis(
691     const InputTimeBreakdown& input_time_breakdown,
692     const ::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any>&
693         any_step_details) {
694   double total_step_time_ms = 0;
695   double total_input_ms = 0;
696   double total_output_ms = 0;
697   double total_host_compute_ms = 0;
698   double total_host_prepare_ms = 0;
699   double total_host_compile_ms = 0;
700   double total_device_compute_ms = 0;
701   double total_device_to_device_ms = 0;
702   double total_device_collectives_ms = 0;
703   double total_unknown_ms = 0;
704 
705   for (const google::protobuf::Any& step_details : any_step_details) {
706     PerGenericStepDetails details;
707     bool success = step_details.UnpackTo(&details);
708     if (!success && !step_details.type_url().empty()) {
709       LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
710                  << std::endl;
711       return {};
712     }
713     total_step_time_ms += details.step_time_ms();
714     total_input_ms +=
715         details.host_wait_input_ms() + details.host_to_device_ms();
716     total_output_ms += details.output_ms();
717     total_host_prepare_ms += details.host_prepare_ms();
718     total_device_compute_ms += details.device_compute_ms();
719     total_device_to_device_ms += details.device_to_device_ms();
720     total_device_collectives_ms += details.device_collectives_ms();
721     total_host_compute_ms += details.host_compute_ms();
722     total_host_compile_ms += details.host_compile_ms();
723     total_unknown_ms += details.unknown_time_ms();
724   }
725 
726   if (total_step_time_ms == 0) {
727     BottleneckAnalysis analysis;
728     analysis.set_input_classification("unknown");
729     analysis.set_input_statement(
730         "No step time measured. Therefore we cannot tell where the "
731         "performance bottleneck is.");
732     analysis.set_kernel_launch_classification("no");
733     analysis.set_kernel_launch_statement("");
734     analysis.set_all_other_classification("no");
735     analysis.set_all_other_statement("");
736     analysis.set_device_collectives_classification("no");
737     analysis.set_device_collectives_statement("");
738     return analysis;
739   }
740   double input_percent = 100.0 * total_input_ms / total_step_time_ms;
741   double output_percent = 100.0 * total_output_ms / total_step_time_ms;
742   double compute_percent = 100.0 * total_device_compute_ms / total_step_time_ms;
743   double device_collectives_percent =
744       100.0 * total_device_collectives_ms / total_step_time_ms;
745 
746   // idle_percent includes host_prepare (i.e. kernel launch, device-to-device,
747   // host compute, host compile, and unknown.
748   double idle_percent =
749       std::max(0.0, 100.0 - input_percent - output_percent - compute_percent -
750                         device_collectives_percent);
751   double kernel_launch_percent =
752       100.0 * total_host_prepare_ms / total_step_time_ms;
753   double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;
754 
755   std::string input_classification;
756   std::string input_statement;
757   bool all_other_reported =
758       InputAnalysis(input_percent, all_other_percent, &input_classification,
759                     &input_statement);
760 
761   std::string device_collectives_classification;
762   std::string device_collectives_statement;
763   DeviceCollectivesAnalysis(device_collectives_percent,
764                             &device_collectives_classification,
765                             &device_collectives_statement);
766 
767   std::string kernel_launch_classification;
768   std::string kernel_launch_statement;
769   KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
770                        &kernel_launch_classification, &kernel_launch_statement);
771 
772   std::string all_other_classification;
773   std::string all_other_statement;
774   AllOtherAnalysis(all_other_reported, all_other_percent,
775                    &all_other_classification, &all_other_statement);
776 
777   BottleneckAnalysis analysis;
778   analysis.set_input_percent(input_percent);
779   analysis.set_output_percent(output_percent);
780   analysis.set_idle_percent(idle_percent);
781   analysis.set_compute_percent(compute_percent);
782 
783   analysis.set_input_classification(input_classification);
784   analysis.set_input_statement(input_statement);
785   analysis.set_kernel_launch_classification(kernel_launch_classification);
786   analysis.set_kernel_launch_statement(kernel_launch_statement);
787   analysis.set_all_other_classification(all_other_classification);
788   analysis.set_all_other_statement(all_other_statement);
789   analysis.set_device_collectives_classification(
790       device_collectives_classification);
791   analysis.set_device_collectives_statement(device_collectives_statement);
792 
793   return analysis;
794 }
795 
GetSummaryNextStep(absl::string_view input_classification,const InputTimeBreakdown & breakdown)796 std::string GetSummaryNextStep(absl::string_view input_classification,
797                                const InputTimeBreakdown& breakdown) {
798   std::string summary_next_step;
799   if (input_classification == "host" || input_classification == "both") {
800     if (!TfDataInUse(breakdown)) {
801       summary_next_step = absl::StrCat(
802           "Consider using ", MakeDocLink(DatasetIntroDoc(), "the tf.data API"),
803           " to enable profiler's host-side analysis for input pipeline. "
804           "Profiler currently does not support custom input pipeline (please "
805           "ignore "
806           "Section ",
807           kHostAnalysisSectionNumber, " below).");
808     } else {
809       summary_next_step =
810           absl::StrCat("Look at Section ", kHostAnalysisSectionNumber,
811                        " for the breakdown of input time on the host.");
812     }
813   } else {
814     summary_next_step = "You may skip the rest of this page.";
815   }
816 
817   return summary_next_step;
818 }
819 
HostToDeviceTransferAsPercentOfInputTime(const InputTimeBreakdown & breakdown)820 double HostToDeviceTransferAsPercentOfInputTime(
821     const InputTimeBreakdown& breakdown) {
822   // Thanks to the scaling trick we did in GenerateHostResult(), we can
823   // estimate the percentage of input-time spent on host-to-device transfer in
824   // the following way.
825   double total_input_time_us =
826       breakdown.demanded_file_read_us() + breakdown.advanced_file_read_us() +
827       breakdown.preprocessing_us() + breakdown.enqueue_us() +
828       breakdown.unclassified_non_enqueue_us();
829   return 100.0 * SafeDivide(breakdown.enqueue_us(), total_input_time_us);
830 }
831 
832 }  // namespace profiler
833 }  // namespace tensorflow
834