1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"
17
18 #include <math.h>
19
20 #include <algorithm>
21 #include <string>
22 #include <vector>
23
24 #include "google/protobuf/any.pb.h"
25 #include "absl/container/flat_hash_map.h"
26 #include "absl/strings/match.h"
27 #include "absl/strings/str_cat.h"
28 #include "absl/strings/string_view.h"
29 #include "tensorflow/core/lib/gtl/map_util.h"
30 #include "tensorflow/core/platform/logging.h"
31 #include "tensorflow/core/platform/types.h"
32 #include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
33 #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
34 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
35 #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
36 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
37 #include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
38 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
39 #include "tensorflow/core/profiler/utils/diagnostics.h"
40 #include "tensorflow/core/profiler/utils/event_span.h"
41 #include "tensorflow/core/profiler/utils/format_utils.h"
42 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
43 #include "tensorflow/core/profiler/utils/html_utils.h"
44 #include "tensorflow/core/profiler/utils/math_utils.h"
45 #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
46 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
47 #include "tensorflow/core/profiler/utils/time_utils.h"
48 #include "tensorflow/core/util/stats_calculator.h"
49
50 namespace tensorflow {
51 namespace profiler {
52
53 namespace {
54
// Number of picoseconds in one millisecond.
const double kNumPsPerMs = 1000000000.0;

// If the percentage of step time that is due to infeed is less than
// kModeratelyInfeedBoundThresholdInPercent, it is considered NOT
// input-bound; else if it is less than
// kHighlyInfeedBoundThresholdInPercent, it is considered MODERATELY
// input-bound; else it is considered HIGHLY input-bound.
constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
constexpr double kHighlyInfeedBoundThresholdInPercent = 20;

// If the percentage of step time that is due to outfeed is less than
// kModeratelyOutfeedBoundThresholdInPercent, it is considered NOT
// output-bound; else if it is less than
// kHighlyOutfeedBoundThresholdInPercent, it is considered MODERATELY
// output-bound; else it is considered HIGHLY output-bound.
constexpr double kModeratelyOutfeedBoundThresholdInPercent = 5;
constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;

// If the percentage of step time that is due to kernel launch is less than
// kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
// kernel-launch bound; else if it is less than
// kHighlyKernelLaunchBoundThresholdInPercent, it is considered MODERATELY
// kernel-launch bound; else it is considered HIGHLY kernel-launch bound.
constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;

// If the percentage of step time that is due to all other time is less than
// kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
// all-other bound; else if it is less than
// kHighlyAllOtherBoundThresholdInPercent, it is considered MODERATELY
// all-other bound; else it is considered HIGHLY all-other bound.
constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;

// If the percentage of step time that is due to device collectives is less than
// kModeratelyDeviceCollectivesBoundThresholdInPercent, it is considered NOT
// device-collectives bound; else if it is less than
// kHighlyDeviceCollectivesBoundThresholdInPercent, it is considered MODERATELY
// device-collectives bound; else it is considered HIGHLY device-collectives
// bound.
constexpr double kModeratelyDeviceCollectivesBoundThresholdInPercent = 3;
constexpr double kHighlyDeviceCollectivesBoundThresholdInPercent = 15;

// Section number of the host-analysis section in the input-pipeline analysis.
constexpr int kHostAnalysisSectionNumber = 3;
// Python-only explanation for "All Others" time.
const char* kAllOthersPythonExplanation =
    " % of the total step time sampled is spent on 'All Others' time. "
    "This could be due to Python execution overhead.";
// Explanation for "Kernel Launch" time due to CPU contention with tf.data.
const char* kKernelLaunchTfDataContention =
    " It could be due to CPU contention with tf.data. In this case, you may "
    "try to set the environment variable TF_GPU_THREAD_MODE=gpu_private.";
108
109 template <class Collection>
GetTimeInMs(const Collection & type_ps,EventType event_type)110 double GetTimeInMs(const Collection& type_ps, EventType event_type) {
111 return PicosToMillis(gtl::FindWithDefault(type_ps, event_type, /*value=*/0));
112 }
113
GetStepSummaryForSampleStats(const Stat<double> & sample_stats)114 StepSummary GetStepSummaryForSampleStats(const Stat<double>& sample_stats) {
115 StepSummary step_time_summary;
116 double avg, sdv, min, max;
117 if (sample_stats.empty()) {
118 // If sample_stats is empty, sample_stats.avg() will return NaN. However, we
119 // prefer to show an 0 instead.
120 avg = sdv = min = max = 0.0;
121 } else {
122 avg = sample_stats.avg();
123 sdv = sqrt(sample_stats.sample_variance());
124 min = sample_stats.min();
125 max = sample_stats.max();
126 }
127 step_time_summary.set_average(avg);
128 step_time_summary.set_standard_deviation(sdv);
129 step_time_summary.set_minimum(min);
130 step_time_summary.set_maximum(max);
131 return step_time_summary;
132 }
133
// Aggregates the per-step time components stored in `analysis.step_details()`
// into per-component summary statistics (avg/sdv/min/max, in milliseconds).
// Returns an empty breakdown if any step's details cannot be unpacked as
// PerGenericStepDetails.
GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
    const InputPipelineAnalysisResult& analysis) {
  Stat<double> unknown_time_ms;
  Stat<double> host_wait_input_ms;
  Stat<double> host_to_device_ms;
  Stat<double> input_ms;
  Stat<double> output_ms;
  Stat<double> device_compute_ms;
  Stat<double> device_to_device_ms;
  Stat<double> device_collectives_ms;
  Stat<double> host_compute_ms;
  Stat<double> host_prepare_ms;
  Stat<double> host_compile_ms;
  GenericStepTimeBreakdown result;

  for (const google::protobuf::Any& step_details : analysis.step_details()) {
    PerGenericStepDetails details;
    bool success = step_details.UnpackTo(&details);
    if (!success && !step_details.type_url().empty()) {
      // A non-empty type_url that fails to unpack means the payload is not of
      // the expected (generic) type; bail out with an empty result.
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    unknown_time_ms.UpdateStat(details.unknown_time_ms());
    host_wait_input_ms.UpdateStat(details.host_wait_input_ms());
    host_to_device_ms.UpdateStat(details.host_to_device_ms());
    // Total input time is the host-side wait plus the host-to-device copy.
    input_ms.UpdateStat(details.host_wait_input_ms() +
                        details.host_to_device_ms());
    output_ms.UpdateStat(details.output_ms());
    device_compute_ms.UpdateStat(details.device_compute_ms());
    device_to_device_ms.UpdateStat(details.device_to_device_ms());
    device_collectives_ms.UpdateStat(details.device_collectives_ms());
    host_compute_ms.UpdateStat(details.host_compute_ms());
    host_prepare_ms.UpdateStat(details.host_prepare_ms());
    host_compile_ms.UpdateStat(details.host_compile_ms());
  }
  *result.mutable_unknown_time_ms_summary() =
      GetStepSummaryForSampleStats(unknown_time_ms);
  *result.mutable_host_wait_input_ms_summary() =
      GetStepSummaryForSampleStats(host_wait_input_ms);
  *result.mutable_host_to_device_ms_summary() =
      GetStepSummaryForSampleStats(host_to_device_ms);
  *result.mutable_input_ms_summary() = GetStepSummaryForSampleStats(input_ms);
  *result.mutable_output_ms_summary() = GetStepSummaryForSampleStats(output_ms);
  *result.mutable_device_compute_ms_summary() =
      GetStepSummaryForSampleStats(device_compute_ms);
  *result.mutable_device_to_device_ms_summary() =
      GetStepSummaryForSampleStats(device_to_device_ms);
  *result.mutable_device_collectives_ms_summary() =
      GetStepSummaryForSampleStats(device_collectives_ms);
  *result.mutable_host_compute_ms_summary() =
      GetStepSummaryForSampleStats(host_compute_ms);
  *result.mutable_host_prepare_ms_summary() =
      GetStepSummaryForSampleStats(host_prepare_ms);
  *result.mutable_host_compile_ms_summary() =
      GetStepSummaryForSampleStats(host_compile_ms);
  return result;
}
192
// Builds the generic (non-TPU-specific) input-pipeline analysis from the
// per-step profiling data: per-step time details, the summary of step time,
// the summary of input time as a percentage of step time, and the step-time
// breakdown. Returns an empty result if a step breakdown fails to unpack.
InputPipelineAnalysisResult ComputeGenericInputPipelineAnalysisResult(
    const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
  InputPipelineAnalysisResult result;

  // Computes the summary of step time in ms.
  *result.mutable_step_time_summary() =
      ComputeStepTimeSummaryInMs(grouped_by_step);

  Stat<double> input_summary_stats_in_percent;
  for (const auto& coreid_stepinfo_map : grouped_by_step) {
    // Iterates over each step.
    const auto* ptr = gtl::FindOrNull(coreid_stepinfo_map.step_info_per_core(),
                                      kDefaultGpuLocalCoreId);
    if (ptr == nullptr) {
      // For generic hardware, all step-info is put under core-0. If ptr
      // is nullptr, it means there is no step at all.
      continue;
    }
    const StepInfoResult& step_info = *ptr;
    // Adds the details for a new step.
    PerGenericStepDetails details;
    details.set_step_number(step_info.step_num());
    if (step_info.step_name().empty()) {
      // Falls back to the step number when no name was recorded.
      details.set_step_name(absl::StrCat(step_info.step_num()));
    } else {
      details.set_step_name(step_info.step_name());
    }
    details.set_step_time_ms(PicosToMillis(step_info.duration_ps()));
    GenericStepBreakdown generic;
    bool success = step_info.step_breakdown().UnpackTo(&generic);
    if (!success && !step_info.step_breakdown().type_url().empty()) {
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    // Maps the event-type buckets (in ps) to the proto's ms fields. Several
    // fields combine two event types, e.g. host-to-device also includes the
    // time the device spends waiting for the host.
    const auto& type_ps = generic.type_ps();
    details.set_unknown_time_ms(GetTimeInMs(type_ps, UNKNOWN_TIME));
    details.set_host_wait_input_ms(GetTimeInMs(type_ps, HOST_WAIT_INPUT));
    details.set_host_to_device_ms(GetTimeInMs(type_ps, HOST_TO_DEVICE) +
                                  GetTimeInMs(type_ps, DEVICE_WAIT_HOST));
    details.set_output_ms(GetTimeInMs(type_ps, DEVICE_TO_HOST));
    details.set_device_compute_ms(GetTimeInMs(type_ps, DEVICE_COMPUTE_16) +
                                  GetTimeInMs(type_ps, DEVICE_COMPUTE_32));
    details.set_device_to_device_ms(GetTimeInMs(type_ps, DEVICE_TO_DEVICE) +
                                    GetTimeInMs(type_ps, DEVICE_WAIT_DEVICE));
    details.set_device_collectives_ms(GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
    details.set_host_compute_ms(GetTimeInMs(type_ps, HOST_COMPUTE));
    details.set_host_prepare_ms(GetTimeInMs(type_ps, HOST_PREPARE));
    details.set_host_compile_ms(GetTimeInMs(type_ps, HOST_COMPILE));
    result.add_step_details()->PackFrom(details);

    const double input_percent_of_step_time =
        100.0 *
        SafeDivide(details.host_wait_input_ms() + details.host_to_device_ms(),
                   details.step_time_ms());
    input_summary_stats_in_percent.UpdateStat(input_percent_of_step_time);
  }

  // Computes the summary of input time as percentage of step time.
  *result.mutable_input_percent_summary() =
      GetStepSummaryForSampleStats(input_summary_stats_in_percent);

  // Computes the breakdown of step time.
  GenericStepTimeBreakdown generic_step_time_breakdown =
      ComputeGenericStepTimeBreakdownInMs(result);
  result.mutable_step_time_breakdown()->PackFrom(generic_step_time_breakdown);

  return result;
}
262
// Classification of input processing on the host.
enum class InputOpCategory {
  kEnqueue,           // enqueue data to be transferred to device.
  kDemandedFileRead,  // demanded read from file.
  kAdvancedFileRead,  // advanced read from file (including cached,
                      // prefetch, parallel-map, interleave).
  kPreprocessing      // data preprocessing.
};
271
InputOpCategoryString(InputOpCategory category)272 std::string InputOpCategoryString(InputOpCategory category) {
273 switch (category) {
274 case InputOpCategory::kEnqueue:
275 return "Enqueue";
276 case InputOpCategory::kDemandedFileRead:
277 return "Demanded file read";
278 case InputOpCategory::kAdvancedFileRead:
279 return "Advanced file read";
280 case InputOpCategory::kPreprocessing:
281 return "Preprocessing";
282 }
283 }
284
IsInputOp(absl::string_view category)285 inline bool IsInputOp(absl::string_view category) {
286 // Do not include "IteratorGetNext*" here, because IteratorGetNext is an Op
287 // that experiences the install stall, not an Op that causes the input stall.
288 return IsInfeedEnqueueOp(category) || IsDatasetOp(category) ||
289 IsMemcpyHToDOp(category);
290 }
291
292 // TODO(ckluk):
293 // Confirm with the tf.data team if the classification below is correct.
CategorizeInputOp(absl::string_view name,absl::string_view category)294 InputOpCategory CategorizeInputOp(absl::string_view name,
295 absl::string_view category) {
296 if (IsInfeedEnqueueOp(category) || IsMemcpyHToDOp(category)) {
297 // Ops for sending input from host to device.
298 return InputOpCategory::kEnqueue;
299 }
300 DCHECK(IsDatasetOp(category));
301 if (absl::EndsWith(name, "::TFRecord") ||
302 absl::EndsWith(name, "::TextLine") ||
303 absl::EndsWith(name, "::FixedLengthRecord") ||
304 absl::EndsWith(name, "::SSTable") || absl::EndsWith(name, "::RecordIO")) {
305 // Ops that read files.
306 if (absl::StrContains(name, "::MemoryReader") ||
307 absl::StrContains(name, "::MemoryWriter") ||
308 absl::StrContains(name, "::Interleave") ||
309 absl::StrContains(name, "::Prefetch") ||
310 absl::StrContains(name, "::ParallelMap")) {
311 // Ops that read files in advance, including caching, interleaving, and
312 // prefetching.
313 return InputOpCategory::kAdvancedFileRead;
314 } else {
315 // Ops that read files on demand.
316 return InputOpCategory::kDemandedFileRead;
317 }
318 } else {
319 // All other ops are classified as preprocessing.
320 return InputOpCategory::kPreprocessing;
321 }
322 }
323
// The input ops selected from an OpMetricsDb, together with the sum of their
// self times.
struct InputOpMetrics {
  std::vector<const OpMetrics*> input_op_metrics;  // not owned.
  // Sum of self_time_ps over all ops in input_op_metrics.
  uint64 input_op_time_ps = 0;
};
328
SelectInputOpMetrics(const OpMetricsDb & all_op_metrics)329 InputOpMetrics SelectInputOpMetrics(const OpMetricsDb& all_op_metrics) {
330 InputOpMetrics input_op_metrics;
331 for (const OpMetrics* op_metrics : SortedOpMetricsDb(all_op_metrics)) {
332 if (IsInputOp(op_metrics->category())) {
333 input_op_metrics.input_op_metrics.push_back(op_metrics);
334 input_op_metrics.input_op_time_ps += op_metrics->self_time_ps();
335 }
336 }
337 return input_op_metrics;
338 }
339
ConvertOpMetricsToInputOpDetails(const OpMetrics & op_metrics,uint64 input_op_time_ps,InputOpCategory category)340 InputOpDetails ConvertOpMetricsToInputOpDetails(const OpMetrics& op_metrics,
341 uint64 input_op_time_ps,
342 InputOpCategory category) {
343 InputOpDetails details;
344 details.set_op_name(op_metrics.name());
345 details.set_count(op_metrics.occurrences());
346 details.set_time_in_ms(PicosToMillis(op_metrics.time_ps()));
347 details.set_self_time_in_ms(PicosToMillis(op_metrics.self_time_ps()));
348 details.set_time_in_percent(
349 100.0 * SafeDivide(op_metrics.time_ps(), input_op_time_ps));
350 details.set_self_time_in_percent(
351 100.0 * SafeDivide(op_metrics.self_time_ps(), input_op_time_ps));
352 details.set_category(InputOpCategoryString(category));
353 return details;
354 }
355
// Returns the ratio of the host-to-device time in each step to the step-time.
// Prefers the host infeed-enqueue ratio when available (TPU with infeed);
// otherwise derives the ratio from the generic step-time breakdown. Returns
// 0.0 when neither source is available.
double RatioOfHostToDeviceTimeToStepTime(
    const OpMetricsDb& host_tf_metrics_db,
    const InputPipelineAnalysisResult& input_pipeline_analysis) {
  // For TPU execution that uses infeed.
  absl::optional<double> host_infeed_enqueue_ratio =
      HostInfeedEnqueueRatio(host_tf_metrics_db);
  if (host_infeed_enqueue_ratio.has_value()) {
    return host_infeed_enqueue_ratio.value();
  }
  // For GPU and TPU execution that do not use infeed.
  double avg_step_time_ms =
      input_pipeline_analysis.step_time_summary().average();
  if (avg_step_time_ms > 0) {
    // Uses the on-device step time.
    GenericStepTimeBreakdown generic_breakdown;
    if (input_pipeline_analysis.step_time_breakdown().UnpackTo(
            &generic_breakdown)) {
      double avg_host_to_device_time_ms =
          generic_breakdown.host_to_device_ms_summary().average();
      return SafeDivide(avg_host_to_device_time_ms, avg_step_time_ms);
    }
  }
  return 0.0;
}
381
DeviceCollectivesAnalysis(double device_collectives_percent,std::string * device_collectives_classification,std::string * device_collectives_statement)382 void DeviceCollectivesAnalysis(double device_collectives_percent,
383 std::string* device_collectives_classification,
384 std::string* device_collectives_statement) {
385 if (device_collectives_percent >=
386 kHighlyDeviceCollectivesBoundThresholdInPercent) {
387 *device_collectives_classification = "high";
388 *device_collectives_statement =
389 absl::StrCat(OneDigit(device_collectives_percent),
390 " % of the total step time sampled is spent on 'Device "
391 "Collective Communication'.");
392 } else if (device_collectives_percent >=
393 kModeratelyDeviceCollectivesBoundThresholdInPercent) {
394 *device_collectives_classification = "moderate";
395 *device_collectives_statement =
396 absl::StrCat(OneDigit(device_collectives_percent),
397 " % of the total step time sampled is spent on 'Device "
398 "Collective Communication'.");
399 } else {
400 *device_collectives_classification = "no";
401 *device_collectives_statement = "";
402 }
403 }
404
KernelLaunchAnalysis(bool tfdata_used,double kernel_launch_percent,std::string * kernel_launch_classification,std::string * kernel_launch_statement)405 void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
406 std::string* kernel_launch_classification,
407 std::string* kernel_launch_statement) {
408 if (kernel_launch_percent >= kHighlyKernelLaunchBoundThresholdInPercent) {
409 *kernel_launch_classification = "high";
410 *kernel_launch_statement = absl::StrCat(
411 OneDigit(kernel_launch_percent),
412 " % of the total step time sampled is spent on 'Kernel Launch'.");
413 if (tfdata_used) {
414 absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
415 }
416 } else if (kernel_launch_percent >=
417 kModeratelyKernelLaunchBoundThresholdInPercent) {
418 *kernel_launch_classification = "moderate";
419 *kernel_launch_statement = absl::StrCat(
420 OneDigit(kernel_launch_percent),
421 " % of the total step time sampled is spent on 'Kernel Launch'.");
422 if (tfdata_used) {
423 absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
424 }
425 } else {
426 *kernel_launch_classification = "no";
427 *kernel_launch_statement = "";
428 }
429 }
430
AllOtherAnalysis(bool all_other_reported,double all_other_percent,std::string * all_other_classification,std::string * all_other_statement)431 void AllOtherAnalysis(bool all_other_reported, double all_other_percent,
432 std::string* all_other_classification,
433 std::string* all_other_statement) {
434 if (all_other_reported) {
435 *all_other_classification = "no";
436 *all_other_statement = "";
437 return;
438 }
439 if (all_other_percent >= kHighlyAllOtherBoundThresholdInPercent) {
440 *all_other_classification = "high";
441 *all_other_statement =
442 absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
443 } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
444 *all_other_classification = "moderate";
445 *all_other_statement =
446 absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
447 } else {
448 *all_other_classification = "no";
449 *all_other_statement = "";
450 }
451 }
452
453 // Tests if tf.data API is in use.
TfDataInUse(const InputTimeBreakdown & breakdown)454 bool TfDataInUse(const InputTimeBreakdown& breakdown) {
455 // Do not include enqueue_us because the "enqueue" Op that Xprof recognizes is
456 // not part of tf.data.
457 return breakdown.demanded_file_read_us() > 0 ||
458 breakdown.advanced_file_read_us() > 0 ||
459 breakdown.preprocessing_us() > 0;
460 }
461
462 // Returns a HTML link with the given text.
MakeDocLink(absl::string_view doc_link,absl::string_view text)463 std::string MakeDocLink(absl::string_view doc_link, absl::string_view text) {
464 return absl::StrCat("<a href=\"", doc_link, "\" target=\"_blank\">", text,
465 "</a>");
466 }
467
// Returns the URL of the introduction to the tf.data API (note: a bare URL,
// not an HTML link; wrap with MakeDocLink/AnchorElement to render as a link).
std::string DatasetIntroDoc() {
  return "https://www.tensorflow.org/guide/data";
}
472
473 } // namespace
474
// Populates the host-side portion of the input-pipeline analysis: per-op
// input details and the input-time breakdown (enqueue, file reads,
// preprocessing, unclassified), with the non-enqueue components rescaled so
// that they are comparable to the enqueue time.
void GenerateHostResult(const OpMetricsDb& host_tf_metrics_db,
                        InputPipelineAnalysisResult* result) {
  InputOpMetrics input_op_metrics = SelectInputOpMetrics(host_tf_metrics_db);
  // Returns if the program is not using an input pipeline with
  // instrumentation and hence no input ops are found.
  if (input_op_metrics.input_op_metrics.empty()) return;

  // Accumulates each op's self time (us) into its category bucket, and emits
  // one InputOpDetails entry per op.
  absl::flat_hash_map<InputOpCategory, double> aggregated_input_op_times_us;
  for (const OpMetrics* op_metrics : input_op_metrics.input_op_metrics) {
    InputOpCategory category =
        CategorizeInputOp(op_metrics->name(), op_metrics->category());
    *result->add_input_op_details() = ConvertOpMetricsToInputOpDetails(
        *op_metrics, input_op_metrics.input_op_time_ps, category);
    aggregated_input_op_times_us[category] +=
        PicosToMicros(op_metrics->self_time_ps());
  }

  double enqueue_time_us =
      aggregated_input_op_times_us[InputOpCategory::kEnqueue];
  double total_input_op_time_us =
      aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kPreprocessing];

  // Ratio of host-to-device time to step time, capped at 1.0.
  double ratio = std::min(
      1.0, RatioOfHostToDeviceTimeToStepTime(host_tf_metrics_db, *result));
  DCHECK_GE(ratio, 0.0);
  // Estimates the non-enqueue input time from the enqueue time and the
  // host-to-device ratio; when the ratio is unknown (0), falls back to the
  // measured total of the non-enqueue input-op times.
  double non_enqueue_time_us = (ratio != 0.0)
                                   ? (enqueue_time_us * (1.0 - ratio) / ratio)
                                   : total_input_op_time_us;

  // Scales the various input-time components wrt to non_enqueue_time_us,
  // preserving their measured proportions.
  double scaled_demanded_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead],
      total_input_op_time_us);
  double scaled_advanced_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead],
      total_input_op_time_us);
  double scaled_preprocessing_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kPreprocessing],
      total_input_op_time_us);
  // Whatever estimated non-enqueue time is not accounted for by the scaled
  // components (clamped at 0 to guard against rounding).
  double unclassified_non_enqueue_time_us = std::max(
      0.0, non_enqueue_time_us - scaled_demanded_fileread_time_us -
               scaled_advanced_fileread_time_us - scaled_preprocessing_time_us);

  InputTimeBreakdown* input_time_breakdown =
      result->mutable_input_time_breakdown();
  input_time_breakdown->set_enqueue_us(enqueue_time_us);
  input_time_breakdown->set_demanded_file_read_us(
      scaled_demanded_fileread_time_us);
  input_time_breakdown->set_advanced_file_read_us(
      scaled_advanced_fileread_time_us);
  input_time_breakdown->set_preprocessing_us(scaled_preprocessing_time_us);
  input_time_breakdown->set_unclassified_non_enqueue_us(
      unclassified_non_enqueue_time_us);
}
534
// Builds the static set of input-pipeline improvement recommendations, with
// HTML links into the tf.data documentation. The bottleneck analysis and the
// summary-next-step are filled in later by the caller.
InputPipelineAnalysisRecommendation GenerateRecommendation() {
  const absl::string_view kDatasetIntro =
      "https://www.tensorflow.org/programmers_guide/datasets";

  const absl::string_view kDatasetTopic =
      "https://www.tensorflow.org/api_docs/python/tf/data/Dataset#";

  const absl::string_view kTfRecordDataset =
      "https://www.tensorflow.org/api_docs/python/tf/data/"
      "TFRecordDataset#class_tfrecorddataset";

  InputPipelineAnalysisRecommendation recommendation;
  *recommendation.add_details() =
      "Enqueuing data: you may want to combine small input data chunks "
      "into fewer "
      "but larger chunks.";
  *recommendation.add_details() = absl::StrCat(
      "Data preprocessing: you may increase num_parallel_calls in ",
      AnchorElement(absl::StrCat(kDatasetTopic, "map"), "Dataset map()"),
      " or preprocess the data OFFLINE.");
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files in advance: you may tune parameters in the "
      "following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch size"),
      ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"),
                    "interleave cycle_length"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer_size"), ")");
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files on demand: you should read data IN ADVANCE "
      "using the following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch"), ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"), "interleave"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer"), ")");
  *recommendation.add_details() = absl::StrCat(
      "Other data reading or processing: you may consider using the ",
      AnchorElement(kDatasetIntro, "tf.data API"),
      " (if you are not using it now)");

  return recommendation;
}
576
ComputeStepTimeSummaryInMs(const protobuf::RepeatedPtrField<PerCoreStepInfo> & grouped_by_step)577 StepSummary ComputeStepTimeSummaryInMs(
578 const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
579 Stat<double> total_step_stats_in_ms;
580 // iterates over each step.
581 for (const auto& coreid_stepinfo_map : grouped_by_step) {
582 double max_per_step_stats_in_ms = 0.0;
583 // iterates over each core.
584 for (const auto& coreid_and_stepinfo :
585 coreid_stepinfo_map.step_info_per_core()) {
586 const auto& step_info = coreid_and_stepinfo.second;
587 max_per_step_stats_in_ms = std::max(step_info.duration_ps() / kNumPsPerMs,
588 max_per_step_stats_in_ms);
589 }
590 // Step time of each step is determined by the slowest core.
591 total_step_stats_in_ms.UpdateStat(max_per_step_stats_in_ms);
592 }
593
594 return GetStepSummaryForSampleStats(total_step_stats_in_ms);
595 }
596
// Top-level conversion: turns OpStats into a complete input-pipeline analysis
// (per-step details, host-side breakdown, bottleneck analysis, and
// recommendations).
InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis(
    const OpStats& op_stats) {
  InputPipelineAnalysisResult result =
      ComputeGenericInputPipelineAnalysisResult(
          op_stats.step_db().step_sequence());
  PopulateStepDiagnostics(op_stats, result.mutable_diagnostics());
  result.set_hardware_type(HardwareType_Name(
      ParseHardwareType(op_stats.run_environment().device_type())));
  // Fills in the host-side input-op details and input-time breakdown.
  GenerateHostResult(op_stats.host_op_metrics_db(), &result);

  InputPipelineAnalysisRecommendation recommendation = GenerateRecommendation();
  BottleneckAnalysis bottleneck_analysis = ComputeBottleneckAnalysis(
      result.input_time_breakdown(), result.step_details());
  result.set_input_percent(bottleneck_analysis.input_percent());
  result.set_output_percent(bottleneck_analysis.output_percent());
  result.set_idle_percent(bottleneck_analysis.idle_percent());
  result.set_compute_percent(bottleneck_analysis.compute_percent());

  recommendation.mutable_bottleneck_analysis()->PackFrom(bottleneck_analysis);
  *recommendation.mutable_summary_next_step() =
      GetSummaryNextStep(bottleneck_analysis.input_classification(),
                         result.input_time_breakdown());

  *result.mutable_recommendation() = recommendation;
  return result;
}
623
// Classifies how input-bound the program is and writes the classification
// ("host"/"both"/"device") and an explanatory statement. Returns true iff the
// input-bound verdict was driven by significant "All Others" time rather than
// measured input time (i.e. the "POTENTIALLY input-bound" case).
bool InputAnalysis(double input_percent, double all_other_percent,
                   std::string* input_classification,
                   std::string* input_statement) {
  absl::string_view non_input_time = "other time";
  if (input_percent >= kHighlyInfeedBoundThresholdInPercent) {
    *input_classification = "host";
    *input_statement = absl::StrCat(
        "Your program is HIGHLY input-bound because ", OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, you "
        "should first focus on reducing the input time.");
    return false;
  } else if (input_percent >= kModeratelyInfeedBoundThresholdInPercent) {
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is MODERATELY input-bound because ",
        OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, "
        "you would need to reduce both the input time and ",
        non_input_time, ".");
    return false;
  } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
    // Input analysis says it is not input-bound, but "All-Other" time
    // is significant. It could still be input-bound (or Python overhead).
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is POTENTIALLY input-bound because ",
        OneDigit(all_other_percent),
        "% of the total step time sampled is spent on 'All Others' time (which "
        "could be due to I/O or Python execution or both).");
    return true;
  } else {
    // Definitely not input-bound.
    *input_classification = "device";
    *input_statement =
        absl::StrCat("Your program is NOT input-bound because only ",
                     OneDigit(input_percent),
                     "% of the total step time sampled is waiting for "
                     "input. Therefore, you should focus on "
                     "reducing ",
                     non_input_time, ".");
    return false;
  }
}
667
OutputAnalysis(double output_percent,std::string * output_classification,std::string * output_statement)668 void OutputAnalysis(double output_percent, std::string* output_classification,
669 std::string* output_statement) {
670 if (output_percent >= kHighlyOutfeedBoundThresholdInPercent) {
671 *output_classification = "host";
672 *output_statement = absl::StrCat(
673 "Your program is HIGHLY output-bound because ",
674 OneDigit(output_percent),
675 "% of the total step time sampled is spent on output. Therefore, you "
676 "should first focus on reducing the output time.");
677 } else if (output_percent >= kModeratelyOutfeedBoundThresholdInPercent) {
678 *output_classification = "both";
679 *output_statement = absl::StrCat(
680 "Your program is MODERATELY output-bound because ",
681 OneDigit(output_percent),
682 "% of the total step time sampled is spent on output. Therefore, "
683 "you would need to reduce both the output time and other time.");
684 } else {
685 *output_classification = "device";
686 *output_statement = "";
687 }
688 }
689
ComputeBottleneckAnalysis(const InputTimeBreakdown & input_time_breakdown,const::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any> & any_step_details)690 BottleneckAnalysis ComputeBottleneckAnalysis(
691 const InputTimeBreakdown& input_time_breakdown,
692 const ::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any>&
693 any_step_details) {
694 double total_step_time_ms = 0;
695 double total_input_ms = 0;
696 double total_output_ms = 0;
697 double total_host_compute_ms = 0;
698 double total_host_prepare_ms = 0;
699 double total_host_compile_ms = 0;
700 double total_device_compute_ms = 0;
701 double total_device_to_device_ms = 0;
702 double total_device_collectives_ms = 0;
703 double total_unknown_ms = 0;
704
705 for (const google::protobuf::Any& step_details : any_step_details) {
706 PerGenericStepDetails details;
707 bool success = step_details.UnpackTo(&details);
708 if (!success && !step_details.type_url().empty()) {
709 LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
710 << std::endl;
711 return {};
712 }
713 total_step_time_ms += details.step_time_ms();
714 total_input_ms +=
715 details.host_wait_input_ms() + details.host_to_device_ms();
716 total_output_ms += details.output_ms();
717 total_host_prepare_ms += details.host_prepare_ms();
718 total_device_compute_ms += details.device_compute_ms();
719 total_device_to_device_ms += details.device_to_device_ms();
720 total_device_collectives_ms += details.device_collectives_ms();
721 total_host_compute_ms += details.host_compute_ms();
722 total_host_compile_ms += details.host_compile_ms();
723 total_unknown_ms += details.unknown_time_ms();
724 }
725
726 if (total_step_time_ms == 0) {
727 BottleneckAnalysis analysis;
728 analysis.set_input_classification("unknown");
729 analysis.set_input_statement(
730 "No step time measured. Therefore we cannot tell where the "
731 "performance bottleneck is.");
732 analysis.set_kernel_launch_classification("no");
733 analysis.set_kernel_launch_statement("");
734 analysis.set_all_other_classification("no");
735 analysis.set_all_other_statement("");
736 analysis.set_device_collectives_classification("no");
737 analysis.set_device_collectives_statement("");
738 return analysis;
739 }
740 double input_percent = 100.0 * total_input_ms / total_step_time_ms;
741 double output_percent = 100.0 * total_output_ms / total_step_time_ms;
742 double compute_percent = 100.0 * total_device_compute_ms / total_step_time_ms;
743 double device_collectives_percent =
744 100.0 * total_device_collectives_ms / total_step_time_ms;
745
746 // idle_percent includes host_prepare (i.e. kernel launch, device-to-device,
747 // host compute, host compile, and unknown.
748 double idle_percent =
749 std::max(0.0, 100.0 - input_percent - output_percent - compute_percent -
750 device_collectives_percent);
751 double kernel_launch_percent =
752 100.0 * total_host_prepare_ms / total_step_time_ms;
753 double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;
754
755 std::string input_classification;
756 std::string input_statement;
757 bool all_other_reported =
758 InputAnalysis(input_percent, all_other_percent, &input_classification,
759 &input_statement);
760
761 std::string device_collectives_classification;
762 std::string device_collectives_statement;
763 DeviceCollectivesAnalysis(device_collectives_percent,
764 &device_collectives_classification,
765 &device_collectives_statement);
766
767 std::string kernel_launch_classification;
768 std::string kernel_launch_statement;
769 KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
770 &kernel_launch_classification, &kernel_launch_statement);
771
772 std::string all_other_classification;
773 std::string all_other_statement;
774 AllOtherAnalysis(all_other_reported, all_other_percent,
775 &all_other_classification, &all_other_statement);
776
777 BottleneckAnalysis analysis;
778 analysis.set_input_percent(input_percent);
779 analysis.set_output_percent(output_percent);
780 analysis.set_idle_percent(idle_percent);
781 analysis.set_compute_percent(compute_percent);
782
783 analysis.set_input_classification(input_classification);
784 analysis.set_input_statement(input_statement);
785 analysis.set_kernel_launch_classification(kernel_launch_classification);
786 analysis.set_kernel_launch_statement(kernel_launch_statement);
787 analysis.set_all_other_classification(all_other_classification);
788 analysis.set_all_other_statement(all_other_statement);
789 analysis.set_device_collectives_classification(
790 device_collectives_classification);
791 analysis.set_device_collectives_statement(device_collectives_statement);
792
793 return analysis;
794 }
795
// Returns the "next step" sentence for the summary section: points the user
// at the host-side analysis section when the program is input-bound, or tells
// them they can stop reading otherwise.
std::string GetSummaryNextStep(absl::string_view input_classification,
                               const InputTimeBreakdown& breakdown) {
  const bool input_bound =
      input_classification == "host" || input_classification == "both";
  if (!input_bound) {
    return "You may skip the rest of this page.";
  }
  if (TfDataInUse(breakdown)) {
    // tf.data is in use, so the host-side breakdown section is meaningful.
    return absl::StrCat("Look at Section ", kHostAnalysisSectionNumber,
                        " for the breakdown of input time on the host.");
  }
  // Custom (non-tf.data) input pipelines are not instrumented; suggest
  // switching to tf.data instead of reading the host-side section.
  return absl::StrCat(
      "Consider using ", MakeDocLink(DatasetIntroDoc(), "the tf.data API"),
      " to enable profiler's host-side analysis for input pipeline. "
      "Profiler currently does not support custom input pipeline (please "
      "ignore "
      "Section ",
      kHostAnalysisSectionNumber, " below).");
}
819
HostToDeviceTransferAsPercentOfInputTime(const InputTimeBreakdown & breakdown)820 double HostToDeviceTransferAsPercentOfInputTime(
821 const InputTimeBreakdown& breakdown) {
822 // Thanks to the scaling trick we did in GenerateHostResult(), we can
823 // estimate the percentage of input-time spent on host-to-device transfer in
824 // the following way.
825 double total_input_time_us =
826 breakdown.demanded_file_read_us() + breakdown.advanced_file_read_us() +
827 breakdown.preprocessing_us() + breakdown.enqueue_us() +
828 breakdown.unclassified_non_enqueue_us();
829 return 100.0 * SafeDivide(breakdown.enqueue_us(), total_input_time_us);
830 }
831
832 } // namespace profiler
833 } // namespace tensorflow
834