/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"

#include <algorithm>
#include <string>
#include <tuple>
#include <vector>

#include "absl/algorithm/container.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"

namespace tensorflow {
namespace profiler {

namespace {

// The maximum number of kernels displayed on the Kernel Stats page.
const int kMaxNumOfKernels = 1000;

}  // namespace

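// Parses a kernel launch-parameters string from the "kernel_details" xstat
// into <kernel>. The string is a whitespace-separated list of "key:value"
// tokens; for example (illustrative):
//   "regs:32 static_shared:0 dynamic_shared:16384 grid:1024,1,1
//    block:128,1,1 occ_pct:50.0"
// Unrecognized tokens are skipped, and block/grid dimensions default to 1.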
void ParseKernelLaunchParams(absl::string_view xstat_kernel_details,
                             KernelReport* kernel) {
  const std::vector<absl::string_view> params =
      absl::StrSplit(xstat_kernel_details, absl::ByAnyChar(" \n"));

  constexpr uint32 kNumDimensions = 3;
  for (uint32 dim = 0; dim < kNumDimensions; ++dim) {
    kernel->add_block_dim(1);
    kernel->add_grid_dim(1);
  }

  // Process tokens.
  for (const auto& param : params) {
    const std::vector<absl::string_view> key_value = absl::StrSplit(param, ':');
    if (key_value.size() != 2) {
      // Unrecognized token.
      continue;
    }
    absl::string_view key = key_value[0];
    absl::string_view value_str = key_value[1];
    uint32 value = 0;
    double pct = 0.0;
    // Cases that consume a pair of tokens "key:value".
    if (key == "regs" && absl::SimpleAtoi(value_str, &value)) {
      kernel->set_registers_per_thread(value);
    } else if (key == "static_shared" && absl::SimpleAtoi(value_str, &value)) {
      kernel->set_static_shmem_bytes(value);
    } else if (key == "dynamic_shared" && absl::SimpleAtoi(value_str, &value)) {
      kernel->set_dynamic_shmem_bytes(value);
    } else if (key == "block") {
      const std::vector<absl::string_view>& block =
          absl::StrSplit(value_str, ',');
      uint32 tmp[3];
      if (block.size() == 3 && absl::SimpleAtoi(block[0], &tmp[0]) &&
          absl::SimpleAtoi(block[1], &tmp[1]) &&
          absl::SimpleAtoi(block[2], &tmp[2])) {
        std::copy_n(tmp, 3, kernel->mutable_block_dim()->begin());
      }
    } else if (key == "grid") {
      const std::vector<absl::string_view>& grid =
          absl::StrSplit(value_str, ',');
      uint32 tmp[3];
      if (grid.size() == 3 && absl::SimpleAtoi(grid[0], &tmp[0]) &&
          absl::SimpleAtoi(grid[1], &tmp[1]) &&
          absl::SimpleAtoi(grid[2], &tmp[2])) {
        std::copy_n(tmp, 3, kernel->mutable_grid_dim()->begin());
      }
    } else if (key == "occ_pct" && absl::SimpleAtod(value_str, &pct)) {
      kernel->set_occupancy_pct(pct);
    }
  }
}

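// Returns true if the kernel name matches known Tensor Core kernel naming
// patterns. The "884"/"1688" substrings are believed to refer to the HMMA
// instruction shapes used on Volta/Turing GPUs, and "hmma"/"xmma" to NVIDIA's
// matrix-multiply-accumulate kernel families. This is a name-based heuristic
// and is not guaranteed to be exhaustive.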
bool IsKernelUsingTensorCore(absl::string_view kernel_name) {
  // Some examples: volta_h884gemm, volta_fp16_s884gemm,
  // turing_fp16_s1688cudnn_fp16
  bool possible_tensor_kernel = absl::StrContains(kernel_name, "884") ||
                                absl::StrContains(kernel_name, "1688") ||
                                absl::StrContains(kernel_name, "hmma") ||
                                absl::StrContains(kernel_name, "xmma");
  if (possible_tensor_kernel) {
    VLOG(3) << "Possible tensor kernel: " << kernel_name;
  }

  return (absl::StartsWith(kernel_name, "volta_i884") ||
          absl::StartsWith(kernel_name, "volta_h884") ||
          absl::StartsWith(kernel_name, "volta_s884") ||
          absl::StartsWith(kernel_name, "volta_fp16_i884") ||
          absl::StartsWith(kernel_name, "volta_fp16_h884") ||
          absl::StartsWith(kernel_name, "volta_fp16_s884") ||
          absl::StartsWith(kernel_name, "turing_i1688") ||
          absl::StartsWith(kernel_name, "turing_h1688") ||
          absl::StartsWith(kernel_name, "turing_s1688") ||
          absl::StartsWith(kernel_name, "turing_fp16_i1688") ||
          absl::StartsWith(kernel_name, "turing_fp16_h1688") ||
          absl::StartsWith(kernel_name, "turing_fp16_s1688") ||
          absl::StrContains(kernel_name, "hmma") ||
          absl::StrContains(kernel_name, "xmma"));
}

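// Returns true if <tf_op_name> names an op type that can be implemented with
// Tensor Core kernels: convolutions, matrix multiplications, cuDNN RNNs, and
// XLA dot operations.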
// This list is not exhaustive.
bool IsOpTensorCoreEligible(absl::string_view tf_op_name) {
  // Disable formatting to keep inline comments vertically aligned.
  // clang-format off
  return false
      // Using EndsWith to match Fused operations.
      || absl::EndsWith(tf_op_name, "Conv2D")
      || absl::EndsWith(tf_op_name, "Conv2DBackpropFilter")
      || absl::EndsWith(tf_op_name, "Conv2DBackpropInput")
      || absl::EndsWith(tf_op_name, "Conv3D")
      || absl::EndsWith(tf_op_name, "DepthwiseConv2dNative")
      || absl::EndsWith(tf_op_name, "DepthwiseConv2dNativeBackpropFilter")
      || absl::EndsWith(tf_op_name, "DepthwiseConv2dNativeBackpropInput")
      // Using Contains to match V2/V3 suffixes.
      || absl::StrContains(tf_op_name, "BatchMatMul")
      // MatMul requires exact matching.
      || absl::EndsWith(tf_op_name, "/MatMul")
      || absl::EndsWith(tf_op_name, "FusedMatMul")
      // cuDNN operations.
      || absl::EndsWith(tf_op_name, "/CudnnRNN")
      || absl::StrContains(tf_op_name, "CudnnRNNV")
      || absl::StrContains(tf_op_name, "CudnnRNNForward")
      || absl::StrContains(tf_op_name, "CudnnRNNBackprop")
      // Special cases.
      || absl::EndsWith(tf_op_name, "XlaDot")
      || absl::EndsWith(tf_op_name, "XlaDotV2");
  // clang-format on
}

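// An einsum equation is considered Tensor Core eligible only if it is a plain
// binary contraction of the form "<lhs>,<rhs>-><output>" (e.g. "ab,bc->ac");
// single-input and multi-input equations are rejected.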
bool IsEinsumTensorCoreEligible(absl::string_view equation) {
  if (equation.empty()) {
    return false;
  }
  const std::vector<absl::string_view> input_output =
      absl::StrSplit(equation, "->");
  if (input_output.size() != 2) {
    return false;
  }
  const std::vector<absl::string_view> lhs_rhs =
      absl::StrSplit(input_output[0], ',');
  return lhs_rhs.size() == 2;
}

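// Defines a strict weak ordering over KernelReports by comparing all grouping
// fields (kernel name, launch dimensions, resource usage, Tensor Core flags,
// op name), so reports can be sorted deterministically.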
bool KernelReportLessThanComparator::operator()(const KernelReport& lhs,
                                                const KernelReport& rhs) const {
  // Disable formatting to keep vertical alignment for better readability,
  // and make it easier to reorder columns.
  // clang-format off
  auto lhs_tuple = std::make_tuple(
      lhs.name(),
      lhs.grid_dim(0),
      lhs.grid_dim(1),
      lhs.grid_dim(2),
      lhs.block_dim(0),
      lhs.block_dim(1),
      lhs.block_dim(2),
      lhs.registers_per_thread(),
      lhs.static_shmem_bytes(),
      lhs.dynamic_shmem_bytes(),
      lhs.is_kernel_using_tensor_core(),
      lhs.is_op_tensor_core_eligible(),
      lhs.op_name());

  auto rhs_tuple = std::make_tuple(
      rhs.name(),
      rhs.grid_dim(0),
      rhs.grid_dim(1),
      rhs.grid_dim(2),
      rhs.block_dim(0),
      rhs.block_dim(1),
      rhs.block_dim(2),
      rhs.registers_per_thread(),
      rhs.static_shmem_bytes(),
      rhs.dynamic_shmem_bytes(),
      rhs.is_kernel_using_tensor_core(),
      rhs.is_op_tensor_core_eligible(),
      rhs.op_name());
  // clang-format on
  return lhs_tuple < rhs_tuple;
}

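// Field-by-field equality over the same fields as the less-than comparator
// above, so that reports differing only in timing can be treated as the same
// kernel and aggregated into a single entry.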
bool KernelReportEqualToComparator::operator()(const KernelReport& lhs,
                                               const KernelReport& rhs) const {
  // Disable formatting to keep vertical alignment for better readability,
  // and make it easier to reorder columns.
  // clang-format off
  // Put the most expensive string comparisons last.
  return (
      lhs.is_kernel_using_tensor_core() == rhs.is_kernel_using_tensor_core() &&
      lhs.is_op_tensor_core_eligible() == rhs.is_op_tensor_core_eligible() &&
      lhs.block_dim(0) == rhs.block_dim(0) &&
      lhs.block_dim(1) == rhs.block_dim(1) &&
      lhs.block_dim(2) == rhs.block_dim(2) &&
      lhs.grid_dim(0) == rhs.grid_dim(0) &&
      lhs.grid_dim(1) == rhs.grid_dim(1) &&
      lhs.grid_dim(2) == rhs.grid_dim(2) &&
      lhs.registers_per_thread() == rhs.registers_per_thread() &&
      lhs.static_shmem_bytes() == rhs.static_shmem_bytes() &&
      lhs.dynamic_shmem_bytes() == rhs.dynamic_shmem_bytes() &&
      lhs.name() == rhs.name() &&
      lhs.op_name() == rhs.op_name());
  // clang-format on
}

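// Sorts reports in place by descending total duration (ties broken by the
// less-than comparator for determinism) and truncates the database to at most
// kMaxNumOfKernels entries.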
void SortAndKeepTopKDurationKernelReportsInDb(KernelStatsDb* kernel_stats_db) {
  auto comp = [](const KernelReport& lhs, const KernelReport& rhs) {
    return lhs.total_duration_ns() > rhs.total_duration_ns() ||
           (lhs.total_duration_ns() == rhs.total_duration_ns() &&
            KernelReportLessThanComparator()(lhs, rhs));
  };

  // Sort and keep at most <kMaxNumOfKernels> kernel reports.
  if (kernel_stats_db->reports_size() > kMaxNumOfKernels) {
    std::partial_sort(
        kernel_stats_db->mutable_reports()->begin(),
        kernel_stats_db->mutable_reports()->begin() + kMaxNumOfKernels,
        kernel_stats_db->mutable_reports()->end(), comp);
    kernel_stats_db->mutable_reports()->erase(
        kernel_stats_db->mutable_reports()->begin() + kMaxNumOfKernels,
        kernel_stats_db->mutable_reports()->end());
  } else {
    std::sort(kernel_stats_db->mutable_reports()->begin(),
              kernel_stats_db->mutable_reports()->end(), comp);
  }
}

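// Copies the kMaxNumOfKernels reports with the longest total duration from
// <reports> into <dst>, filling in each report's aggregated timing stats.
// Sorting pointers into the map avoids copying KernelReport protos that do
// not make the cut.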
void CopyTopKDurationKernelReportsToDb(const KernelReportMap& reports,
                                       KernelStatsDb* dst) {
  std::vector<std::pair<const KernelReport*, const KernelReportValue*>>
      kernels_to_sort;
  kernels_to_sort.reserve(reports.size());
  for (const auto& report_value : reports) {
    kernels_to_sort.push_back(
        std::make_pair(&report_value.first, &report_value.second));
  }

  auto comp =
      [](const std::pair<const KernelReport*, const KernelReportValue*>& lhs,
         const std::pair<const KernelReport*, const KernelReportValue*>& rhs) {
        return lhs.second->total_duration_ns > rhs.second->total_duration_ns ||
               (lhs.second->total_duration_ns ==
                    rhs.second->total_duration_ns &&
                KernelReportLessThanComparator()(*lhs.first, *rhs.first));
      };

  // Sort and copy at most <kMaxNumOfKernels> kernels to <dst>.
  if (kernels_to_sort.size() > kMaxNumOfKernels) {
    absl::c_partial_sort(kernels_to_sort,
                         kernels_to_sort.begin() + kMaxNumOfKernels, comp);
  } else {
    absl::c_sort(kernels_to_sort, comp);
  }

  int copy_size =
      std::min(kMaxNumOfKernels, static_cast<int>(kernels_to_sort.size()));
  for (int i = 0; i < copy_size; i++) {
    KernelReport* report = dst->add_reports();
    *report = *kernels_to_sort[i].first;
    const KernelReportValue& kernel_value = *kernels_to_sort[i].second;
    // Set value using KernelReportValue.
    report->set_occurrences(kernel_value.occurrences);
    report->set_min_duration_ns(kernel_value.min_duration_ns);
    report->set_max_duration_ns(kernel_value.max_duration_ns);
    report->set_total_duration_ns(kernel_value.total_duration_ns);
  }
}

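// Aggregates <value> into the entry for <kernel> in <dst>. A zero occurrence
// count marks a freshly default-constructed entry, which is simply
// overwritten; otherwise durations and occurrence counts are accumulated.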
void InsertOrUpdateKernelReport(const KernelReport& kernel,
                                const KernelReportValue& value,
                                KernelReportMap* dst) {
  KernelReportValue& element = (*dst)[kernel];
  if (element.occurrences == 0) {
    element = value;
  } else {
    element.total_duration_ns += value.total_duration_ns;
    element.min_duration_ns =
        std::min(element.min_duration_ns, value.min_duration_ns);
    element.max_duration_ns =
        std::max(element.max_duration_ns, value.max_duration_ns);
    // Accumulate the full occurrence count; <value> may itself be an
    // aggregate when called from MergeKernelReports, so adding 1 here
    // would undercount.
    element.occurrences += value.occurrences;
  }
}


void MergeKernelReports(const KernelReportMap& reports, KernelReportMap* dst) {
  for (const auto& kernel_value : reports) {
    InsertOrUpdateKernelReport(kernel_value.first, kernel_value.second, dst);
  }
}

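// Buckets kernel reports by TF op name, summing each op's total duration and
// the portion of it spent in Tensor Core kernels.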
KernelStatsByOpName GroupKernelReportsByOpName(
    const KernelStatsDb& kernel_stats_db) {
  KernelStatsByOpName op_level_kernel_stats;
  for (const KernelReport& kernel_report : kernel_stats_db.reports()) {
    auto ret = op_level_kernel_stats.emplace(kernel_report.op_name(),
                                             OpLevelKernelStats());
    if (ret.second) {
      // Inserted. Add a new op in <op_level_kernel_stats>.
      OpLevelKernelStats& stats = ret.first->second;
      stats.is_op_tensor_core_eligible =
          kernel_report.is_op_tensor_core_eligible();
      stats.total_duration_ns += kernel_report.total_duration_ns();
      if (kernel_report.is_kernel_using_tensor_core()) {
        stats.tensor_core_duration_ns += kernel_report.total_duration_ns();
      }
    } else {
      // Not inserted. Aggregate kernel stats to op level.
      OpLevelKernelStats& stats = ret.first->second;
      // Verifies operations with the same name have the same TensorCore
      // eligibility.
      DCHECK_EQ(stats.is_op_tensor_core_eligible,
                kernel_report.is_op_tensor_core_eligible());
      stats.total_duration_ns += kernel_report.total_duration_ns();
      if (kernel_report.is_kernel_using_tensor_core()) {
        stats.tensor_core_duration_ns += kernel_report.total_duration_ns();
      }
    }
  }
  return op_level_kernel_stats;
}

}  // namespace profiler
}  // namespace tensorflow