1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
17 #include "absl/strings/str_cat.h"
18 #include "absl/strings/str_format.h"
19 #include "tensorflow/compiler/xla/metric_table_report.h"
20 #include "tensorflow/compiler/xla/util.h"
21 #include "tensorflow/core/lib/strings/numbers.h"
22
23 namespace xla {
24
25 using absl::StrAppend;
26 using absl::StrAppendFormat;
27 using absl::StrCat;
28 using absl::StrFormat;
29 using tensorflow::strings::HumanReadableElapsedTime;
30 using tensorflow::strings::HumanReadableNumBytes;
31
ToString() const32 string HumanReadableProfileBuilder::ToString() const {
33 string s;
34
35 StrAppendFormat(&s, "Execution profile for %s: (%s @ f_nom)\n",
36 computation_name_,
37 HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)));
38
39 int64 cumulative_cycles = 0;
40 auto print_op = [&](const OpInfo& op, bool is_total = false) {
41 // Skip ops with 0 optimal seconds and 0 actual cycles. These are ops that
42 // were expected to be free and are actually free -- things like (on most
43 // backends) kParameter or kConstant HLOs. There's no need to clutter the
44 // profile with these.
45 if (op.optimal_seconds == 0 && op.cycles == 0) {
46 return;
47 }
48
49 string bytes_per_sec;
50 string bytes_per_cycle;
51 if (op.cycles > 0 && op.bytes_accessed >= 0) {
52 bytes_per_sec = StrCat(
53 HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles)),
54 "/s");
55 double bpc = static_cast<double>(op.bytes_accessed) / op.cycles;
56 if (op.bytes_accessed > op.cycles) {
57 bytes_per_cycle = StrCat(HumanReadableNumBytes(bpc), "/cycle");
58 } else {
59 bytes_per_cycle = StrFormat("%.3fB/cycle", bpc);
60 }
61 }
62
63 double cumulative_cycles_percent = 0;
64 double cycles_percent = 0;
65 if (!is_total) {
66 cumulative_cycles += op.cycles;
67 }
68 if (total_cycles_ > 0) {
69 cycles_percent = op.cycles / static_cast<double>(total_cycles_) * 100;
70 cumulative_cycles_percent =
71 cumulative_cycles / static_cast<double>(total_cycles_) * 100;
72 }
73
74 string cycles_percent_str;
75 if (is_total) {
76 // Leaving off the two trailing decimal points of "100.%" lets us save two
77 // columns in the output.
78 cycles_percent_str = "100.% 100Σ";
79 } else {
80 cycles_percent_str = StrFormat("%5.2f%% %2.0fΣ", cycles_percent,
81 cumulative_cycles_percent);
82 }
83
84 double nsecs = op.cycles / clock_rate_ghz_;
85 StrAppendFormat(
86 &s,
87 "%15d cycles (%s) :: %12.1f usec %22s :: %18s :: %18s :: %14s :: "
88 "%16s :: %s\n",
89 op.cycles, cycles_percent_str, CyclesToMicroseconds(op.cycles),
90 op.optimal_seconds < 0
91 ? ""
92 : StrFormat("(%12.1f optimal)", op.optimal_seconds * 1e6),
93 op.flop_count > 0 && nsecs > 0
94 ? HumanReadableNumFlops(op.flop_count, nsecs)
95 : "",
96 op.transcendental_count > 0 && nsecs > 0
97 ? HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs)
98 : "",
99 bytes_per_sec, bytes_per_cycle, op.name);
100 };
101
102 double optimal_seconds_sum = 0;
103 int64 total_flops = 0.;
104 int64 total_transcendentals = 0.;
105 int64 total_bytes = 0;
106 for (const auto& op : op_infos_) {
107 if (op.optimal_seconds > 0) {
108 // An op can run faster than the estimated optimum. For example, we might
109 // estimate a fusion's speed by looking at the size of its operands and
110 // result, but perhaps the fusion doesn't read the entirety of all of its
111 // inputs. For the purposes of summing the instructions' optimal speeds,
112 // we treat the "optimum" as the smallest of either the estimated optimum
113 // and the actual speed.
114 optimal_seconds_sum +=
115 std::min(double{op.optimal_seconds}, CyclesToSeconds(op.cycles));
116 }
117 total_flops += std::max(op.flop_count, int64{0});
118 total_transcendentals += std::max(op.transcendental_count, int64{0});
119 total_bytes += std::max(op.bytes_accessed, int64{0});
120 }
121
122 VLOG(1) << "Total floating point ops: " << total_flops;
123
124 print_op({is_entry_computation_ ? "[total] [entry]" : "[total]", "[total]",
125 /*category=*/"", total_cycles_, total_flops, total_transcendentals,
126 total_bytes, static_cast<float>(optimal_seconds_sum)},
127 /*is_total=*/true);
128
129 // Sort ops in decreasing order of cycles, and print them.
130 std::vector<OpInfo> sorted_ops(op_infos_);
131 absl::c_sort(sorted_ops, [](const OpInfo& a, const OpInfo& b) {
132 return a.cycles > b.cycles;
133 });
134 for (const auto& op : sorted_ops) {
135 print_op(op);
136 }
137
138 if (total_cycles_ <= 0) {
139 StrAppend(&s, "****** 0 total cycles ******\n");
140 } else {
141 // Only show an optimal discrepancy table if at least one value was
142 // specified. Estimates are non-negative, so if the sum is greater than
143 // zero, then at least one summand was greater than zero.
144 if (optimal_seconds_sum > 0) {
145 MetricTableReport table;
146 table.SetMetricName("microseconds above estimated optimum");
147 table.SetEntryName("ops");
148 table.SetShowCategoryTable();
149 table.SetShowAllEntries();
150 float total_discrepancy_in_microseconds = 0.0f;
151 for (const auto& op : op_infos_) {
152 // Skip ops with < 0 optimal seconds. These are ops for which we don't
153 // know the optimal time.
154 if (op.optimal_seconds < 0) {
155 continue;
156 }
157 // Also skip ops with 0 actual cycles. These ops were free; there's no
158 // need to clutter the "above estimated optimum" table with them,
159 // because they can't be optimized further.
160 if (op.cycles == 0) {
161 continue;
162 }
163 MetricTableReport::Entry entry;
164 entry.text = op.name;
165 entry.short_text = op.short_name;
166 entry.category_text = op.category;
167 // Ignore ops that run faster than the estimated optimal here, as we do
168 // when calculating optimal_seconds_sum.
169 entry.metric = std::max(
170 0., CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6);
171 total_discrepancy_in_microseconds += entry.metric;
172 table.AddEntry(std::move(entry));
173 }
174 StrAppend(&s, table.MakeReport(total_discrepancy_in_microseconds));
175 }
176
177 {
178 MetricTableReport table;
179 table.SetMetricName("microseconds");
180 table.SetEntryName("ops");
181 table.SetShowCategoryTable();
182 table.SetShowAllEntries();
183 for (const auto& op : op_infos_) {
184 // Skip ops with 0 optimal seconds and 0 actual cycles. As in
185 // print_op(), these are uninteresting because they're expected to be
186 // free, and they were actually free.
187 if (op.cycles == 0 && op.optimal_seconds == 0) {
188 continue;
189 }
190 MetricTableReport::Entry entry;
191 entry.text = op.name;
192 entry.short_text = op.short_name;
193 entry.category_text = op.category;
194 entry.metric = CyclesToMicroseconds(op.cycles);
195 table.AddEntry(std::move(entry));
196 }
197 StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_)));
198 }
199 }
200
201 if (total_bytes > 0) {
202 MetricTableReport table;
203 table.SetMetricName("MiB read+written");
204 table.SetEntryName("ops");
205 table.SetShowCategoryTable();
206 for (const auto& op : op_infos_) {
207 MetricTableReport::Entry entry;
208 entry.text = op.name;
209 entry.short_text = op.short_name;
210 entry.category_text = op.category;
211 entry.metric = static_cast<double>(op.bytes_accessed) / (1 << 20);
212 table.AddEntry(std::move(entry));
213 }
214 StrAppend(&s,
215 table.MakeReport(static_cast<double>(total_bytes) / (1 << 20)));
216 }
217 return s;
218 }
219
220 } // namespace xla
221