/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"

#include <algorithm>
#include <string>
#include <tuple>
#include <vector>

#include "absl/algorithm/container.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"

namespace tensorflow {
namespace profiler {

namespace {

// The maximum number of kernels displayed on the Kernel Stats page.
const int kMaxNumOfKernels = 1000;

}  // namespace

void ParseKernelLaunchParams(absl::string_view xstat_kernel_details,
                             KernelReport* kernel) {
  const std::vector<absl::string_view> params =
      absl::StrSplit(xstat_kernel_details, absl::ByAnyChar(" \n"));

  constexpr uint32 kNumDimensions = 3;
  for (uint32 dim = 0; dim < kNumDimensions; ++dim) {
    kernel->add_block_dim(1);
    kernel->add_grid_dim(1);
  }

  // Process tokens.
  for (const auto& param : params) {
    const std::vector<absl::string_view> key_value = absl::StrSplit(param, ':');
    if (key_value.size() != 2) {
      // Unrecognized token.
      continue;
    }
    absl::string_view key = key_value[0];
    absl::string_view value_str = key_value[1];
    uint32 value = 0;
    double pct = 0.0;
    // Cases that consume a pair of tokens "key:value".
    if (key == "regs" && absl::SimpleAtoi(value_str, &value)) {
      kernel->set_registers_per_thread(value);
    } else if (key == "static_shared" && absl::SimpleAtoi(value_str, &value)) {
      kernel->set_static_shmem_bytes(value);
    } else if (key == "dynamic_shared" && absl::SimpleAtoi(value_str, &value)) {
      kernel->set_dynamic_shmem_bytes(value);
    } else if (key == "block") {
      const std::vector<absl::string_view>& block =
          absl::StrSplit(value_str, ',');
      uint32 tmp[3];
      if (block.size() == 3 && absl::SimpleAtoi(block[0], &tmp[0]) &&
          absl::SimpleAtoi(block[1], &tmp[1]) &&
          absl::SimpleAtoi(block[2], &tmp[2])) {
        std::copy_n(tmp, 3, kernel->mutable_block_dim()->begin());
      }
    } else if (key == "grid") {
      const std::vector<absl::string_view>& grid =
          absl::StrSplit(value_str, ',');
      uint32 tmp[3];
      if (grid.size() == 3 && absl::SimpleAtoi(grid[0], &tmp[0]) &&
          absl::SimpleAtoi(grid[1], &tmp[1]) &&
          absl::SimpleAtoi(grid[2], &tmp[2])) {
        std::copy_n(tmp, 3, kernel->mutable_grid_dim()->begin());
      }
    } else if (key == "occ_pct" && absl::SimpleAtod(value_str, &pct)) {
      kernel->set_occupancy_pct(pct);
    }
  }
}

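// Example: for an illustrative xstat_kernel_details string such as
//   "regs:32 static_shared:2048 dynamic_shared:0 block:128,1,1
//    grid:1024,1,1 occ_pct:50.0"
// the parser above yields registers_per_thread=32, static_shmem_bytes=2048,
// dynamic_shmem_bytes=0, block_dim={128,1,1}, grid_dim={1024,1,1}, and
// occupancy_pct=50.0. (Hypothetical input; only the keys handled above are
// recognized, and unrecognized tokens are skipped.)
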
bool IsKernelUsingTensorCore(absl::string_view kernel_name) {
  // Some examples: volta_h884gemm, volta_fp16_s884gemm,
  // turing_fp16_s1688cudnn_fp16
  bool possible_tensor_kernel = absl::StrContains(kernel_name, "884") ||
                                absl::StrContains(kernel_name, "1688") ||
                                absl::StrContains(kernel_name, "hmma") ||
                                absl::StrContains(kernel_name, "xmma");
  if (possible_tensor_kernel) {
    VLOG(3) << "Possible tensor kernel: " << kernel_name;
  }

  return (absl::StartsWith(kernel_name, "volta_i884") ||
          absl::StartsWith(kernel_name, "volta_h884") ||
          absl::StartsWith(kernel_name, "volta_s884") ||
          absl::StartsWith(kernel_name, "volta_fp16_i884") ||
          absl::StartsWith(kernel_name, "volta_fp16_h884") ||
          absl::StartsWith(kernel_name, "volta_fp16_s884") ||
          absl::StartsWith(kernel_name, "turing_i1688") ||
          absl::StartsWith(kernel_name, "turing_h1688") ||
          absl::StartsWith(kernel_name, "turing_s1688") ||
          absl::StartsWith(kernel_name, "turing_fp16_i1688") ||
          absl::StartsWith(kernel_name, "turing_fp16_h1688") ||
          absl::StartsWith(kernel_name, "turing_fp16_s1688") ||
          absl::StrContains(kernel_name, "hmma") ||
          absl::StrContains(kernel_name, "xmma"));
}

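// Illustrative results, derived from the prefix and substring checks above
// (the kernel names themselves are hypothetical examples of the patterns):
//   IsKernelUsingTensorCore("volta_h884gemm_64x64_nt")     -> true
//   IsKernelUsingTensorCore("turing_fp16_s1688cudnn_fp16") -> true
//   IsKernelUsingTensorCore("maxwell_sgemm_128x64_nn")     -> false
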
// This list is not exhaustive.
bool IsOpTensorCoreEligible(absl::string_view tf_op_name) {
  // Disable formatting to keep inline comments vertically aligned.
  // clang-format off
  return false
      // Using EndsWith to match Fused operations.
      || absl::EndsWith(tf_op_name, "Conv2D")
      || absl::EndsWith(tf_op_name, "Conv2DBackpropFilter")
      || absl::EndsWith(tf_op_name, "Conv2DBackpropInput")
      || absl::EndsWith(tf_op_name, "Conv3D")
      || absl::EndsWith(tf_op_name, "DepthwiseConv2dNative")
      || absl::EndsWith(tf_op_name, "DepthwiseConv2dNativeBackpropFilter")
      || absl::EndsWith(tf_op_name, "DepthwiseConv2dNativeBackpropInput")
      // Using Contains to match V2/V3 suffixes.
      || absl::StrContains(tf_op_name, "BatchMatMul")
      // MatMul requires exact matching.
      || absl::EndsWith(tf_op_name, "/MatMul")
      || absl::EndsWith(tf_op_name, "FusedMatMul")
      // cuDNN operations.
      || absl::EndsWith(tf_op_name, "/CudnnRNN")
      || absl::StrContains(tf_op_name, "CudnnRNNV")
      || absl::StrContains(tf_op_name, "CudnnRNNForward")
      || absl::StrContains(tf_op_name, "CudnnRNNBackprop")
      // Special cases.
      || absl::EndsWith(tf_op_name, "XlaDot")
      || absl::EndsWith(tf_op_name, "XlaDotV2");
  // clang-format on
}

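// Illustrative results (hypothetical op names):
//   IsOpTensorCoreEligible("model/conv1/Conv2D")      -> true   (EndsWith)
//   IsOpTensorCoreEligible("model/dense/MatMul")      -> true   ("/MatMul")
//   IsOpTensorCoreEligible("BatchMatMulV2")           -> true   (Contains)
//   IsOpTensorCoreEligible("model/dense/MatMul_grad") -> false
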
// An einsum equation is considered Tensor Core eligible when it is a binary
// contraction: exactly two comma-separated inputs and a single output.
bool IsEinsumTensorCoreEligible(absl::string_view equation) {
  if (equation.empty()) {
    return false;
  }
  const std::vector<absl::string_view> input_output =
      absl::StrSplit(equation, "->");
  if (input_output.size() != 2) {
    return false;
  }
  const std::vector<absl::string_view> lhs_rhs =
      absl::StrSplit(input_output[0], ',');
  return lhs_rhs.size() == 2;
}

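// Illustrative results:
//   IsEinsumTensorCoreEligible("ab,bc->ac") -> true   (two inputs, one output)
//   IsEinsumTensorCoreEligible("ij->ji")    -> false  (single input)
//   IsEinsumTensorCoreEligible("")          -> false
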
bool KernelReportLessThanComparator::operator()(const KernelReport& lhs,
                                                const KernelReport& rhs) const {
  // Disable formatting to keep vertical alignment for better readability,
  // and make it easier to reorder columns.
  // clang-format off
  auto lhs_tuple = std::make_tuple(
      lhs.name(),
      lhs.grid_dim(0),
      lhs.grid_dim(1),
      lhs.grid_dim(2),
      lhs.block_dim(0),
      lhs.block_dim(1),
      lhs.block_dim(2),
      lhs.registers_per_thread(),
      lhs.static_shmem_bytes(),
      lhs.dynamic_shmem_bytes(),
      lhs.is_kernel_using_tensor_core(),
      lhs.is_op_tensor_core_eligible(),
      lhs.op_name());

  auto rhs_tuple = std::make_tuple(
      rhs.name(),
      rhs.grid_dim(0),
      rhs.grid_dim(1),
      rhs.grid_dim(2),
      rhs.block_dim(0),
      rhs.block_dim(1),
      rhs.block_dim(2),
      rhs.registers_per_thread(),
      rhs.static_shmem_bytes(),
      rhs.dynamic_shmem_bytes(),
      rhs.is_kernel_using_tensor_core(),
      rhs.is_op_tensor_core_eligible(),
      rhs.op_name());
  // clang-format on
  return lhs_tuple < rhs_tuple;
}

bool KernelReportEqualToComparator::operator()(const KernelReport& lhs,
                                               const KernelReport& rhs) const {
  // Disable formatting to keep vertical alignment for better readability,
  // and make it easier to reorder columns.
  // clang-format off
  // Put the most expensive string comparisons last.
  return (
      lhs.is_kernel_using_tensor_core() == rhs.is_kernel_using_tensor_core() &&
      lhs.is_op_tensor_core_eligible() == rhs.is_op_tensor_core_eligible() &&
      lhs.block_dim(0) == rhs.block_dim(0) &&
      lhs.block_dim(1) == rhs.block_dim(1) &&
      lhs.block_dim(2) == rhs.block_dim(2) &&
      lhs.grid_dim(0) == rhs.grid_dim(0) &&
      lhs.grid_dim(1) == rhs.grid_dim(1) &&
      lhs.grid_dim(2) == rhs.grid_dim(2) &&
      lhs.registers_per_thread() == rhs.registers_per_thread() &&
      lhs.static_shmem_bytes() == rhs.static_shmem_bytes() &&
      lhs.dynamic_shmem_bytes() == rhs.dynamic_shmem_bytes() &&
      lhs.name() == rhs.name() &&
      lhs.op_name() == rhs.op_name());
  // clang-format on
}

void SortAndKeepTopKDurationKernelReportsInDb(KernelStatsDb* kernel_stats_db) {
  auto comp = [](const KernelReport& lhs, const KernelReport& rhs) {
    return lhs.total_duration_ns() > rhs.total_duration_ns() ||
           (lhs.total_duration_ns() == rhs.total_duration_ns() &&
            KernelReportLessThanComparator()(lhs, rhs));
  };

  // Sort and keep at most <kMaxNumOfKernels> kernel reports.
  if (kernel_stats_db->reports_size() > kMaxNumOfKernels) {
    std::partial_sort(
        kernel_stats_db->mutable_reports()->begin(),
        kernel_stats_db->mutable_reports()->begin() + kMaxNumOfKernels,
        kernel_stats_db->mutable_reports()->end(), comp);
    kernel_stats_db->mutable_reports()->erase(
        kernel_stats_db->mutable_reports()->begin() + kMaxNumOfKernels,
        kernel_stats_db->mutable_reports()->end());
  } else {
    std::sort(kernel_stats_db->mutable_reports()->begin(),
              kernel_stats_db->mutable_reports()->end(), comp);
  }
}

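// Usage sketch (assumed calling context): once all kernel reports have been
// collected into a KernelStatsDb, a caller would run
//   SortAndKeepTopKDurationKernelReportsInDb(&kernel_stats_db);
// leaving at most kMaxNumOfKernels reports, ordered by descending
// total_duration_ns with ties broken by KernelReportLessThanComparator.
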
void CopyTopKDurationKernelReportsToDb(const KernelReportMap& reports,
                                       KernelStatsDb* dst) {
  std::vector<std::pair<const KernelReport*, const KernelReportValue*>>
      kernels_to_sort;
  kernels_to_sort.reserve(reports.size());
  for (const auto& report_value : reports) {
    kernels_to_sort.push_back(
        std::make_pair(&report_value.first, &report_value.second));
  }

  auto comp =
      [](const std::pair<const KernelReport*, const KernelReportValue*>& lhs,
         const std::pair<const KernelReport*, const KernelReportValue*>& rhs) {
        return lhs.second->total_duration_ns > rhs.second->total_duration_ns ||
               (lhs.second->total_duration_ns ==
                    rhs.second->total_duration_ns &&
                KernelReportLessThanComparator()(*lhs.first, *rhs.first));
      };

  // Sort and copy at most <kMaxNumOfKernels> kernels to <dst>.
  if (kernels_to_sort.size() > kMaxNumOfKernels) {
    absl::c_partial_sort(kernels_to_sort,
                         kernels_to_sort.begin() + kMaxNumOfKernels, comp);
  } else {
    absl::c_sort(kernels_to_sort, comp);
  }

  int copy_size =
      std::min(kMaxNumOfKernels, static_cast<int>(kernels_to_sort.size()));
  for (int i = 0; i < copy_size; i++) {
    KernelReport* report = dst->add_reports();
    *report = *kernels_to_sort[i].first;
    const KernelReportValue& kernel_value = *kernels_to_sort[i].second;
    // Set value using KernelReportValue.
    report->set_occurrences(kernel_value.occurrences);
    report->set_min_duration_ns(kernel_value.min_duration_ns);
    report->set_max_duration_ns(kernel_value.max_duration_ns);
    report->set_total_duration_ns(kernel_value.total_duration_ns);
  }
}

void InsertOrUpdateKernelReport(const KernelReport& kernel,
                                const KernelReportValue& value,
                                KernelReportMap* dst) {
  KernelReportValue& element = (*dst)[kernel];
  if (element.occurrences == 0) {
    element = value;
  } else {
    element.total_duration_ns += value.total_duration_ns;
    element.min_duration_ns =
        std::min(element.min_duration_ns, value.min_duration_ns);
    element.max_duration_ns =
        std::max(element.max_duration_ns, value.max_duration_ns);
    // Accumulate the full occurrence count: values merged from another map
    // (see MergeKernelReports) may already aggregate multiple launches, so
    // adding 1 here would undercount.
    element.occurrences += value.occurrences;
  }
}

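// Usage sketch for recording a single kernel launch of <duration_ns> (assumed
// calling context; the field names are those accessed above):
//   KernelReportValue value;
//   value.total_duration_ns = duration_ns;
//   value.min_duration_ns = duration_ns;
//   value.max_duration_ns = duration_ns;
//   value.occurrences = 1;
//   InsertOrUpdateKernelReport(kernel, value, &reports);
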
void MergeKernelReports(const KernelReportMap& reports, KernelReportMap* dst) {
  for (const auto& kernel_value : reports) {
    InsertOrUpdateKernelReport(kernel_value.first, kernel_value.second, dst);
  }
}

KernelStatsByOpName GroupKernelReportsByOpName(
    const KernelStatsDb& kernel_stats_db) {
  KernelStatsByOpName op_level_kernel_stats;
  for (const KernelReport& kernel_report : kernel_stats_db.reports()) {
    auto ret = op_level_kernel_stats.emplace(kernel_report.op_name(),
                                             OpLevelKernelStats());
    if (ret.second) {
      // Inserted. Add a new op in <op_level_kernel_stats>.
      OpLevelKernelStats& stats = ret.first->second;
      stats.is_op_tensor_core_eligible =
          kernel_report.is_op_tensor_core_eligible();
      stats.total_duration_ns += kernel_report.total_duration_ns();
      if (kernel_report.is_kernel_using_tensor_core()) {
        stats.tensor_core_duration_ns += kernel_report.total_duration_ns();
      }
    } else {
      // Not inserted. Aggregate kernel stats to op level.
      OpLevelKernelStats& stats = ret.first->second;
      // Verifies operations with the same name have the same TensorCore
      // eligibility.
      DCHECK_EQ(stats.is_op_tensor_core_eligible,
                kernel_report.is_op_tensor_core_eligible());
      stats.total_duration_ns += kernel_report.total_duration_ns();
      if (kernel_report.is_kernel_using_tensor_core()) {
        stats.tensor_core_duration_ns += kernel_report.total_duration_ns();
      }
    }
  }
  return op_level_kernel_stats;
}

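// Usage sketch (hypothetical consumer): the per-op map makes it easy to
// compute a Tensor Core utilization ratio per op, e.g.
//   for (const auto& [op_name, stats] : GroupKernelReportsByOpName(db)) {
//     double tc_ratio =
//         stats.total_duration_ns == 0
//             ? 0.0
//             : static_cast<double>(stats.tensor_core_duration_ns) /
//                   stats.total_duration_ns;
//   }
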
}  // namespace profiler
}  // namespace tensorflow