/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/util.h"

#include <stdarg.h>

#include <cmath>
#include <limits>
#include <numeric>

#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/inlined_vector.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/math/math_util.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/bfloat16.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/numbers.h"
#include "tensorflow/core/platform/stacktrace.h"

namespace xla {

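// Logs the given (non-OK) status at VLOG(1) and the current stack trace at
// VLOG(2), then returns the status unchanged so callers can write
// `return WithLogBacktrace(status);`.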
Status WithLogBacktrace(const Status& status) {
  CHECK(!status.ok());
  VLOG(1) << status.ToString();
  VLOG(2) << tensorflow::CurrentStackTrace();
  return status;
}

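// RAII timer: records the start time at construction (when enabled) and logs
// the elapsed time, together with the cumulative/max statistics accumulated in
// `timer_stats`, when StopAndLog() is called or the timer is destroyed.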
ScopedLoggingTimer::ScopedLoggingTimer(const std::string& label, bool enabled,
                                       const char* file, int line,
                                       TimerStats* timer_stats)
    : enabled_(enabled),
      file_(file),
      line_(line),
      label_(label),
      timer_stats_(timer_stats) {
  if (enabled_) {
    start_micros_ = tensorflow::Env::Default()->NowMicros();
  }
}

void ScopedLoggingTimer::StopAndLog() {
  if (enabled_) {
    uint64 end_micros = tensorflow::Env::Default()->NowMicros();
    double secs = (end_micros - start_micros_) / 1000000.0;

    TimerStats& stats = *timer_stats_;
    tensorflow::mutex_lock lock(stats.stats_mutex);
    stats.cumulative_secs += secs;
    if (secs > stats.max_secs) {
      stats.max_secs = secs;
    }
    stats.times_called++;

    LOG(INFO).AtLocation(file_, line_)
        << label_
        << " time: " << tensorflow::strings::HumanReadableElapsedTime(secs)
        << " (cumulative: "
        << tensorflow::strings::HumanReadableElapsedTime(stats.cumulative_secs)
        << ", max: "
        << tensorflow::strings::HumanReadableElapsedTime(stats.max_secs)
        << ", #called: " << stats.times_called << ")";
    enabled_ = false;
  }
}

ScopedLoggingTimer::~ScopedLoggingTimer() { StopAndLog(); }

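// Returns a new non-OK status with the same code as `prior`, with `context`
// prepended to the error message ("context: message").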
Status AddStatus(Status prior, absl::string_view context) {
  CHECK(!prior.ok());
  return Status{prior.code(),
                absl::StrCat(context, ": ", prior.error_message())};
}

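// Returns a new non-OK status with the same code as `prior`, with `context`
// appended to the error message ("message: context").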
Status AppendStatus(Status prior, absl::string_view context) {
  CHECK(!prior.ok());
  return Status{prior.code(),
                absl::StrCat(prior.error_message(), ": ", context)};
}

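// Strips leading/trailing ASCII whitespace from each line of `original` and
// prefixes every line with `indentation`. For example,
// Reindent("a\n  b", "  ") yields "  a\n  b".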
string Reindent(absl::string_view original,
                const absl::string_view indentation) {
  std::vector<string> pieces =
      absl::StrSplit(absl::string_view(original.data(), original.size()), '\n');
  return absl::StrJoin(pieces, "\n", [indentation](string* out, string s) {
    absl::StrAppend(out, indentation, absl::StripAsciiWhitespace(s));
  });
}

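// The precisions below are chosen so the printed value round-trips to the
// same bits: a p-bit significand needs ceil(p*log10(2)) + 1 significant
// decimal digits, i.e. 4 digits for bfloat16 (p = 8) and 5 for half (p = 11).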
string RoundTripFpToString(tensorflow::bfloat16 value) {
  return absl::StrFormat("%.4g", static_cast<float>(value));
}

string RoundTripFpToString(Eigen::half value) {
  return absl::StrFormat("%.5g", static_cast<float>(value));
}

string RoundTripFpToString(float value) {
  char buffer[tensorflow::strings::kFastToBufferSize];
  tensorflow::strings::FloatToBuffer(value, buffer);
  return buffer;
}

string RoundTripFpToString(double value) {
  char buffer[tensorflow::strings::kFastToBufferSize];
  tensorflow::strings::DoubleToBuffer(value, buffer);
  return buffer;
}

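// Returns a PaddingConfig with `rank` dimensions, all with zero edge and
// interior padding.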
PaddingConfig MakeNoPaddingConfig(int64 rank) {
  PaddingConfig padding_config;
  for (int64 dnum = 0; dnum < rank; ++dnum) {
    auto dimension = padding_config.add_dimensions();
    dimension->set_edge_padding_low(0);
    dimension->set_edge_padding_high(0);
    dimension->set_interior_padding(0);
  }
  return padding_config;
}

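// Returns a PaddingConfig built from (low, high) edge-padding pairs, one pair
// per dimension, with zero interior padding.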
PaddingConfig MakeEdgePaddingConfig(
    absl::Span<const std::pair<int64, int64>> padding) {
  PaddingConfig padding_config;
  for (const std::pair<int64, int64>& dim : padding) {
    auto dimension = padding_config.add_dimensions();
    dimension->set_edge_padding_low(dim.first);
    dimension->set_edge_padding_high(dim.second);
    dimension->set_interior_padding(0);
  }
  return padding_config;
}

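// Returns true if any dimension of `config` has nonzero interior padding.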
bool HasInteriorPadding(const PaddingConfig& config) {
  for (const auto& dim : config.dimensions()) {
    if (dim.interior_padding() != 0) {
      return true;
    }
  }
  return false;
}

namespace {
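// Formats a throughput figure such as "1.23GFLOP/s": `flops` per
// `nanoseconds` is scaled by 1e9 to ops per second before humanizing.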
string HumanReadableNumOps(double flops, double nanoseconds,
                           absl::string_view op_prefix) {
  if (nanoseconds == 0) {
    return absl::StrCat("NaN ", op_prefix, "OP/s");
  }
  double nano_flops = flops / nanoseconds;
  string throughput = tensorflow::strings::HumanReadableNum(
      static_cast<int64>(nano_flops * 1e9));
  absl::string_view sp(throughput);
  // Use the more common "G(FLOPS)", rather than "B(FLOPS)".
  if (absl::EndsWith(sp, "B") ||  // Ends in 'B', ignoring case.
      absl::EndsWith(sp, "b")) {
    *throughput.rbegin() = 'G';
  }
  throughput += absl::StrCat(op_prefix, "OP/s");
  return throughput;
}
}  // namespace

string HumanReadableNumFlops(double flops, double nanoseconds) {
  return HumanReadableNumOps(flops, nanoseconds, "FL");
}

string HumanReadableNumTranscendentalOps(double trops, double nanoseconds) {
  return HumanReadableNumOps(trops, nanoseconds, "TR");
}

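// Logs `text` line by line at severity `sev`, attributing each line to
// `fname:lineno`. FATAL severity is downgraded to ERROR for the individual
// lines so the whole message is emitted before the final FATAL abort.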
void LogLines(int sev, absl::string_view text, const char* fname, int lineno) {
  const int orig_sev = sev;
  if (sev == tensorflow::FATAL) {
    sev = tensorflow::ERROR;
  }

  // Protect calls with a mutex so we don't interleave calls to LogLines from
  // multiple threads.
  static tensorflow::mutex log_lines_mu(tensorflow::LINKER_INITIALIZED);
  tensorflow::mutex_lock lock(log_lines_mu);

  size_t cur = 0;
  while (cur < text.size()) {
    size_t eol = text.find('\n', cur);
    if (eol == absl::string_view::npos) {
      eol = text.size();
    }
    auto msg = text.substr(cur, eol - cur);
    tensorflow::internal::LogString(fname, lineno, sev,
                                    string(msg.data(), msg.size()));
    cur = eol + 1;
  }

  if (orig_sev == tensorflow::FATAL) {
    tensorflow::internal::LogString(fname, lineno, orig_sev,
                                    "Aborting due to errors.");
  }
}

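// Returns the product of the elements of `xs`; an empty span yields 1.
// For example, Product({2, 3, 4}) == 24.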
int64 Product(absl::Span<const int64> xs) {
  return std::accumulate(xs.begin(), xs.end(), static_cast<int64>(1),
                         std::multiplies<int64>());
}

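// Splits `a` and `b` (which must have equal element products) into the
// sequence of index pairs (i, j) at which the prefix products of a[0:i) and
// b[0:j) agree. For example,
//   CommonFactors({2, 3, 4}, {6, 4}) == {{0, 0}, {2, 1}, {3, 2}},
// since 2*3 == 6 and 2*3*4 == 6*4. The first pair is always (0, 0) and the
// last is (a.size(), b.size()).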
absl::InlinedVector<std::pair<int64, int64>, 8> CommonFactors(
    absl::Span<const int64> a, absl::Span<const int64> b) {
  CHECK_EQ(Product(a), Product(b));
  absl::InlinedVector<std::pair<int64, int64>, 8> bounds;
  if (absl::c_equal(a, b)) {
    bounds.reserve(a.size() + 1);
    for (int64 i = 0; i <= a.size(); ++i) {
      bounds.emplace_back(i, i);
    }
    return bounds;
  }
  if (0 == Product(a)) {
    return {std::make_pair(0, 0), std::make_pair(a.size(), b.size())};
  }

  for (int64 i = 0, j = 0, prior_i = -1, prior_j = -1, partial_size_a = 1,
             partial_size_b = 1;
       ;) {
    if (partial_size_a == partial_size_b && (i > prior_i || j > prior_j)) {
      std::tie(prior_i, prior_j) = std::make_pair(i, j);
      bounds.emplace_back(i, j);
      continue;
    }
    bool in_bounds_i = i < a.size();
    bool in_bounds_j = j < b.size();
    if (!(in_bounds_i || in_bounds_j)) {
      break;
    }
    bool next_a =
        partial_size_a < partial_size_b ||
        (in_bounds_i &&
         (!in_bounds_j || (partial_size_a == partial_size_b && a[i] <= b[j])));
    bool next_b =
        partial_size_b < partial_size_a ||
        (in_bounds_j &&
         (!in_bounds_i || (partial_size_b == partial_size_a && b[j] <= a[i])));
    if (next_a) {
      partial_size_a *= a[i];
      ++i;
    }
    if (next_b) {
      partial_size_b *= b[j];
      ++j;
    }
  }
  return bounds;
}

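// Maps `from_dimensions` (dimension indices into a shape with sizes
// `from_sizes`) onto a reshaped shape with sizes `to_sizes`, using the
// common-factor decomposition above: a factor group whose `from` dimensions
// are all requested is translated in full to `to` dimensions, while a group
// that is only partially requested keeps its requested dimensions as
// untransformed.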
ConvertedDimensionNumbers ConvertDimensionNumbers(
    absl::Span<const int64> from_dimensions, absl::Span<const int64> from_sizes,
    absl::Span<const int64> to_sizes) {
  ConvertedDimensionNumbers dimensions;
  auto common_factors = CommonFactors(from_sizes, to_sizes);
  for (int64 i = 0; i < common_factors.size() - 1; ++i) {
    bool any_present = false;
    bool all_present = true;
    for (int64 d = common_factors[i].first; d < common_factors[i + 1].first;
         ++d) {
      const bool present = absl::c_linear_search(from_dimensions, d);
      any_present |= present;
      all_present &= present;
    }
    if (all_present) {
      for (int64 d = common_factors[i].second; d < common_factors[i + 1].second;
           ++d) {
        dimensions.to_dimensions.push_back(d);
      }
      for (int64 d = common_factors[i].first; d < common_factors[i + 1].first;
           ++d) {
        dimensions.transformed_from_dimensions.push_back(d);
      }
    } else if (any_present) {
      for (int64 d = common_factors[i].first; d < common_factors[i + 1].first;
           ++d) {
        if (absl::c_linear_search(from_dimensions, d)) {
          dimensions.untransformed_from_dimensions.push_back(d);
        }
      }
    }
  }
  return dimensions;
}
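
// Replaces characters that are awkward in file names ('/', '\', '[', ']',
// and spaces) with underscores.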
string SanitizeFileName(string file_name) {
  for (char& c : file_name) {
    if (c == '/' || c == '\\' || c == '[' || c == ']' || c == ' ') {
      c = '_';
    }
  }
  return file_name;
}

// Utility function to split a double-precision float (F64) into a pair of
// F32s. For a p-bit number, and a splitting point (p/2) <= s <= (p - 1), the
// algorithm produces a (p - s)-bit value 'hi' and a non-overlapping (s - 1)-bit
// value 'lo'. See Theorem 4 in [1] (attributed to Dekker) or [2] for the
// original theorem by Dekker.
//
// For double-precision F64s, which have a 53-bit mantissa (52 bits of it
// explicit), we can represent the most significant 49 bits as the unevaluated
// sum of two single-precision floats 'hi' and 'lo'. The 'hi' float stores the
// most significant 24 bits, and the sign bit of 'lo' together with its
// mantissa stores the remaining 25 bits. The exponent of the resulting
// representation is still restricted to the 8 bits of an F32.
//
// References:
// [1] A. Thall, Extended-Precision Floating-Point Numbers for GPU Computation,
//     SIGGRAPH Research Posters, 2006.
//     (http://andrewthall.org/papers/df64_qf128.pdf)
// [2] T. J. Dekker, A floating point technique for extending the available
//     precision, Numerische Mathematik, vol. 18, pp. 224–242, 1971.
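//
// For example, for x = 0.1, 'hi' is the F32 nearest to 0.1 and 'lo' captures
// the signed residual x - double(hi), so that double(hi) + double(lo)
// reproduces the leading bits of x.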
std::pair<float, float> SplitF64ToF32(double x) {
  const float x_f32 = static_cast<float>(x);

  // Early return if x is an infinity or NaN.
  if (!std::isfinite(x_f32)) {
    // Only values within the range of F32 are supported, unless it is
    // infinity. Small values with large negative exponents would be rounded
    // to zero.
    if (std::isfinite(x)) {
      LOG(WARNING) << "Out of range F64 constant detected: " << x;
    }
    return std::make_pair(x_f32, 0.0f);
  }

  // The high float is simply the double rounded to the nearest float. Because
  // we are rounding to nearest with ties to even, the error introduced in
  // rounding is at most half an ULP of 'hi'.
  const float hi = x_f32;
  // We can compute the low term using Sterbenz' lemma: if a and b are two
  // positive floating point numbers and a/2 ≤ b ≤ 2a, then their difference
  // can be computed exactly.
  // Note: the difference is computed exactly but is then rounded to the
  // nearest float, which introduces additional error.
  const float lo = static_cast<float>(x - static_cast<double>(hi));
  return std::make_pair(hi, lo);
}

}  // namespace xla