/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.h"

#include "absl/algorithm/container.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/time/time.h"
#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/logger.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/proto/proto_utils.h"
#include "tensorflow/stream_executor/gpu/redzone_allocator.h"

namespace xla {
namespace gpu {
namespace {

using absl::optional;
using se::DeviceMemoryBase;
using se::dnn::AlgorithmDesc;
using tensorflow::AutotuneResult;

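// Scratch-space allocator used during autotuning. Allocations go through the
// given se::DeviceMemoryAllocator and are owned by this object, so every
// scratch buffer is released when the ScratchAllocator is destroyed.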
class ScratchAllocator : public se::ScratchAllocator {
 public:
  ScratchAllocator(int device_ordinal,
                   se::DeviceMemoryAllocator* memory_allocator)
      : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}

  int64 GetMemoryLimitInBytes() override {
    return 1LL << 32;  // 4GB. TODO(jlebar): Tune this?
  }
  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }

  StatusOr<se::DeviceMemory<uint8>> AllocateBytes(int64 byte_size) override;

  template <typename T>
  StatusOr<se::DeviceMemory<T>> Allocate(int64 num_elements) {
    TF_ASSIGN_OR_RETURN(se::DeviceMemory<uint8> bytes,
                        AllocateBytes(num_elements * sizeof(T)));
    return se::DeviceMemory<T>(bytes);
  }

 private:
  const int device_ordinal_;
  se::DeviceMemoryAllocator* memory_allocator_;
  std::vector<se::OwningDeviceMemory> allocated_buffers_;
  int64 total_allocated_bytes_ = 0;
};

StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
    int64 byte_size) {
  CHECK_GE(byte_size, 0) << "byte_size must be non-negative.";
  if (byte_size > GetMemoryLimitInBytes()) {
    return se::port::Status(
        se::port::error::RESOURCE_EXHAUSTED,
        absl::StrFormat(
            "Allocating %d bytes exceeds the memory limit of %d bytes.",
            byte_size, GetMemoryLimitInBytes()));
  }

  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer,
                      memory_allocator_->Allocate(device_ordinal_, byte_size,
                                                  /*retry_on_failure=*/false));
  total_allocated_bytes_ += byte_size;

  se::DeviceMemoryBase buffer_addr = *allocated_buffer;
  allocated_buffers_.push_back(std::move(allocated_buffer));
  return se::DeviceMemory<uint8>(buffer_addr);
}

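// Returns the candidate convolution algorithms reported by the StreamExecutor's
// DNN library for the given convolution kind; used on the CUDA autotuning path
// below.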
std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                         se::StreamExecutor* stream_exec) {
  std::vector<AlgorithmDesc> algorithms;
  bool succ = false;
  switch (kind) {
    case CudnnConvKind::kBackwardFilter:
      succ =
          stream_exec->GetConvolveBackwardFilterAlgorithms(true, &algorithms);
      break;
    case CudnnConvKind::kBackwardInput:
      succ = stream_exec->GetConvolveBackwardDataAlgorithms(true, &algorithms);
      break;
    case CudnnConvKind::kForward:
    case CudnnConvKind::kForwardActivation:
      succ = stream_exec->GetConvolveAlgorithms(true, &algorithms);
      break;
  }
  DCHECK(succ);

  return algorithms;
}

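// Overload used on the ROCm path: queries MIOpen for profiled candidate
// algorithms for `conv`, using descriptors derived from the operand and result
// buffers.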
StatusOr<std::vector<se::dnn::ProfileResult>> GetAlgorithms(
    const HloCustomCallInstruction* conv,
    absl::Span<se::DeviceMemoryBase> operand_buffers,
    se::DeviceMemoryBase result_buffer, se::StreamExecutor* stream_exec,
    se::Stream* stream) {
  std::vector<se::dnn::ProfileResult> algorithms;

  TF_ASSIGN_OR_RETURN(se::dnn::ConvolutionKind kind,
                      GetDnnConvolutionKind(conv));

  TF_ASSIGN_OR_RETURN(se::dnn::DataType dtype, GetDnnDataType(conv));

  TF_ASSIGN_OR_RETURN(GpuConvParams params,
                      GetGpuConvParams(conv, operand_buffers, result_buffer));

  bool succ = stream_exec->GetMIOpenConvolveAlgorithms(
      kind, stream, dtype, params.input_descriptor, params.filter_descriptor,
      params.conv_desc, params.output_descriptor, &algorithms);
  DCHECK(succ);

  return algorithms;
}

string AlgorithmToString(const AlgorithmDesc& algo) {
  if (algo.tensor_ops_enabled()) {
    return absl::StrCat(algo.algo_id(), "+TC");
  }
  return absl::StrCat(algo.algo_id());
}

string NumBytesToString(int64 bytes) {
  return absl::StrCat(tensorflow::strings::HumanReadableNumBytes(bytes), " (",
                      bytes, "B)");
}

tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
  tensorflow::CudnnVersion cudnn_version;
  if (auto* dnn = stream_executor->AsDnn()) {
    StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
    if (version_or.ok()) {
      const auto& version = version_or.ValueOrDie();
      cudnn_version.set_major(version.major_version());
      cudnn_version.set_minor(version.minor_version());
      cudnn_version.set_patch(version.patch());
    }
  }
  return cudnn_version;
}

tensorflow::ComputeCapability GetComputeCapability(
    se::StreamExecutor* stream_executor) {
  tensorflow::ComputeCapability cc;
  int cc_major, cc_minor;
  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
                                                                  &cc_minor);
  cc.set_major(cc_major);
  cc.set_minor(cc_minor);
  return cc;
}

void PrintPlatformInfo(const se::Stream* stream) {
  auto* se = stream->parent();
  const auto& desc = se->GetDeviceDescription();
  LOG(ERROR) << "Device: " << desc.name();
  LOG(ERROR) << "Platform: " << desc.platform_version();
  LOG(ERROR) << "Driver: " << desc.driver_version();
  LOG(ERROR) << "Runtime: " << desc.runtime_version();

  auto* dnn = se->AsDnn();
  if (dnn) {
    auto dnn_version = dnn->GetVersion();
    if (dnn_version.ok()) {
      auto v = dnn_version.ValueOrDie();
      LOG(ERROR) << "cudnn version: " << v.major_version() << "."
                 << v.minor_version() << "." << v.patch();
    }
  }
}

// Returns true if the redzones in `allocator`'s allocations are unmodified.
//
// If the redzones are modified, logs an error, sets the appropriate failure
// bits on `result`, and returns false.
//
// Returns a status if an unexpected error has occurred, and the stream
// has been poisoned.
//
// `name` is a user-friendly name for the set of redzones being checked, e.g.
// "input/output" or "scratch".
StatusOr<bool> CheckRedzones(const se::RedzoneAllocator& allocator,
                             se::Stream* stream, absl::string_view name,
                             const HloInstruction* instr,
                             AutotuneResult* result) {
  XLA_SCOPED_LOGGING_TIMER_LEVEL("CudnnConvAlgorithmPicker checking redzones",
                                 2);
  using RedzoneCheckStatus = se::RedzoneAllocator::RedzoneCheckStatus;
  TF_ASSIGN_OR_RETURN(RedzoneCheckStatus redzone_check,
                      allocator.CheckRedzones());
  if (redzone_check.ok()) {
    return true;
  }

  auto* fail = result->mutable_failure();
  fail->set_kind(AutotuneResult::REDZONE_MODIFIED);
  *fail->mutable_msg() = redzone_check.RedzoneFailureMsg();
  fail->set_buffer_address(
      reinterpret_cast<uint64>(redzone_check.user_buffer_address));

  LOG(ERROR) << absl::StreamFormat(
      "Detected cudnn out-of-bounds write in conv %s buffer! This is likely a "
      "cudnn bug. We will skip this algorithm in the future, but your GPU "
      "state may already be corrupted, leading to incorrect results. Within "
      "Google, no action is needed on your part. Outside of Google, please "
      "ensure you're running the latest version of cudnn. If that doesn't fix "
      "the problem, please file a bug with this full error message and we'll "
      "contact nvidia.",
      name);
  LOG(ERROR) << redzone_check.RedzoneFailureMsg();
  LOG(ERROR) << "HloInstruction " << instr->ToString();
  PrintPlatformInfo(stream);
  return false;
}

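// In-process autotuning cache. Keyed on the stream executor plus the
// canonicalized HLO string of the convolution (including its backend config),
// mapping to the winning AutotuneResult. Guarded by autotune_cache_lock below.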
using ConvCacheKey =
    std::tuple<se::StreamExecutor*,
               /* conv->ToString(HloPrintOptions::Canonical()) */ std::string>;

struct ConvCacheStats {
  int64 cache_hits = 0;
  int64 cache_misses = 0;

  void LogStats() {
    VLOG(2) << "Cache hits: " << cache_hits;
    VLOG(2) << "Cache misses: " << cache_misses;
  }
};

ConvCacheKey AutotuneCacheKeyfromInstruction(
    const HloCustomCallInstruction* conv, se::StreamExecutor* se) {
  auto options = HloPrintOptions::Canonical();
  options.set_print_backend_config(true);
  return std::make_tuple(se, conv->ToString(options));
}

tensorflow::mutex autotune_cache_lock(tensorflow::LINKER_INITIALIZED);
auto& autotune_cache GUARDED_BY(autotune_cache_lock) =
    *new absl::flat_hash_map<ConvCacheKey, AutotuneResult>();
auto& autotune_cache_stats GUARDED_BY(autotune_cache_lock) =
    *new ConvCacheStats();
}  // anonymous namespace

StatusOr<AutotuneResult> GpuConvAlgorithmPicker::PickBestAlgorithm(
    const HloCustomCallInstruction* instr) {
  // Don't run this function concurrently on the same GPU.
  //
  // This is a bit of a hack and doesn't protect us against arbitrary
  // concurrent use of a GPU, but it's sufficient to let us compile two HLO
  // modules concurrently and then run them sequentially.
  //
  // Putting the lock in here rather than in PickBestAlgorithmNoCache lets us
  // avoid ever doing duplicate work. If we have a cache miss, only one thread
  // will run PickBestAlgorithmImpl for a particular device.
  tensorflow::mutex_lock lock = LockGpu(stream_exec_);

  // We cache the autotuning results to avoid doing duplicate work, which can
  // greatly improve both stability (deterministic numeric results within a
  // process for a given input) and performance (2x speedup on some models).
  ConvCacheKey key = AutotuneCacheKeyfromInstruction(instr, stream_exec_);
  {
    tensorflow::mutex_lock lock(autotune_cache_lock);
    auto it = autotune_cache.find(key);
    if (it != autotune_cache.end()) {
      autotune_cache_stats.cache_hits++;
      return it->second;
    }
    autotune_cache_stats.cache_misses++;
  }

  // Make sure any previous activity on this executor is done. We don't want to
  // interfere with programs that are still running on the GPU.
  if (!stream_exec_->SynchronizeAllActivity()) {
    return InternalError("Failed to synchronize GPU for autotuning.");
  }

  // allocator either points to this->allocator_ or, if that's null, to a
  // se::StreamExecutorMemoryAllocator for stream_exec_.
  se::DeviceMemoryAllocator* allocator;
  optional<se::StreamExecutorMemoryAllocator> se_allocator;
  if (allocator_ != nullptr) {
    allocator = allocator_;
  } else {
    se_allocator.emplace(stream_exec_);
    allocator = &*se_allocator;
  }

  TF_ASSIGN_OR_RETURN(se::Stream* const stream,
                      allocator->GetStream(stream_exec_->device_ordinal()));
  StatusOr<AutotuneResult> result_or(InternalError("Unknown platform."));
  // Check which platform the StreamExecutor is running on; the ROCm and CUDA
  // implementations have diverged. In particular, the redzone-allocator
  // utilities must not be used in the ROCm path.
  if (stream_exec_->platform_kind() == se::PlatformKind::kROCm) {
    result_or = PickBestAlgorithmNoCacheRocm(instr, allocator, stream);
  } else if (stream_exec_->platform_kind() == se::PlatformKind::kCuda) {
    result_or = PickBestAlgorithmNoCacheCuda(instr, allocator, stream);
  }

  if (result_or.ok()) {
    tensorflow::mutex_lock lock(autotune_cache_lock);
    CHECK(autotune_cache.insert({key, result_or.ValueOrDie()}).second);
  }
  return result_or;
}

// The following function allows deterministic ops to be implemented relatively
// quickly using environment variables. It is intended to be temporary. The
// longer-term intention is to enable deterministic ops via tf.config and
// appropriate plumbing. See the discussion on PR 34951 for more information:
// https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
// This function and associated comment are replicated in the following three
// places:
//   1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
//   2. tensorflow/core/kernels/gpu_utils.cc
//   3. tensorflow/stream_executor/cuda/cuda_dnn.cc
// When implementing the plumbing, you should also search for the use of
// TF_DETERMINISTIC_OPS on its own.
// TODO(duncanriach): move to an API that uses tf.config and implement the
// first phase of plumbing.
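// For example, running with TF_CUDNN_DETERMINISTIC=1 (or TF_DETERMINISTIC_OPS=1)
// in the environment makes the selection logic below keep the first usable
// algorithm instead of the one with the smallest measured runtime.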
static bool RequireCudnnDeterminism() {
  static bool require_cudnn_determinism = [] {
    bool deterministic_ops = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
                                               /*default_val=*/false,
                                               &deterministic_ops));
    bool cudnn_deterministic = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC",
                                               /*default_val=*/false,
                                               &cudnn_deterministic));
    return deterministic_ops || cudnn_deterministic;
  }();
  return require_cudnn_determinism;
}

StatusOr<tensorflow::AutotuneResult>
GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
    const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
    se::Stream* stream) {
  // Right now the redzone allocator is available only for the CUDA target.
  XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
      "GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr->ToString()));

  const Shape& result_shape = instr->shape().tuple_shapes(0);
  int64 rng_state = 0;

  const HloModuleConfig& hlo_module_config = instr->GetModule()->config();
  const int32 conv_autotune_level =
      hlo_module_config.debug_options().xla_gpu_autotune_level();
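  // Autotune levels: above 1 we initialize the buffers with generated data
  // before each run; above 3 we additionally check redzones and compare
  // results across algorithms.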
  const bool init_conv_data = conv_autotune_level > 1;
  const bool check_conv = conv_autotune_level > 3;
  const auto initialize_buffer = [init_conv_data, &stream, &rng_state](
                                     DeviceMemoryBase buffer,
                                     const Shape& buffer_shape) {
    if (init_conv_data) {
      InitializeBuffer(stream, buffer_shape.element_type(), &rng_state, buffer);
    }
  };

  // Allocate space for the input, filter, and output of the convolution.
  se::RedzoneAllocator input_output_allocator(
      stream, allocator, PtxOptsFromConfig(hlo_module_config));
  std::vector<se::DeviceMemoryBase> operand_buffers;
  for (const auto* operand : instr->operands()) {
    TF_ASSIGN_OR_RETURN(auto buffer,
                        input_output_allocator.AllocateBytes(
                            ShapeUtil::ByteSizeOf(operand->shape())));
    initialize_buffer(buffer, operand->shape());
    operand_buffers.push_back(buffer);
  }
  TF_ASSIGN_OR_RETURN(auto result_buffer,
                      input_output_allocator.AllocateBytes(
                          ShapeUtil::ByteSizeOf(result_shape)));
  initialize_buffer(result_buffer, result_shape);

  TF_ASSIGN_OR_RETURN(auto backend_config,
                      instr->backend_config<CudnnConvBackendConfig>());

  optional<BufferComparator> comparator;
  // Use the first algorithm that runs successfully as the reference. There is
  // no particular reason to prefer it; any algorithm would do. Note that being
  // the reference does not mean the algorithm is assumed to be correct.
  se::DeviceMemoryBase reference_result_buffer;
  AlgorithmDesc first_algorithm;

  TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
  std::vector<AutotuneResult> profile_results;

  const DebugOptions& debug_options =
      instr->GetModule()->config().debug_options();

  const bool crash_on_checking_failure =
      debug_options.xla_gpu_crash_on_verification_failures();

  const auto canonical_hlo =
      std::get<1>(AutotuneCacheKeyfromInstruction(instr, stream_exec_));

  string blas_version;
  if (auto* blas = stream_exec_->AsBlas()) {
    (void)blas->GetVersion(&blas_version);
  }

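  // Algorithms known (via the blacklist) to be buggy for this exact
  // canonicalized convolution on this GPU / cudnn / BLAS combination; they are
  // skipped in the loop below.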
  absl::Span<const AlgorithmDesc> blacklisted_algos =
      GetBlacklistedConvAlgorithms(GetComputeCapability(stream_exec_),
                                   GetCudnnVersion(stream_exec_), blas_version,
                                   canonical_hlo);

  for (const AlgorithmDesc& alg : GetAlgorithms(kind, stream_exec_)) {
    XLA_SCOPED_LOGGING_TIMER_LEVEL(
        absl::StrCat("CudnnConvAlgorithmPicker::PickBestAlgorithm algo ",
                     AlgorithmToString(alg)),
        2);

    if (absl::c_linear_search(blacklisted_algos, alg)) {
      LOG(INFO) << "Omitted potentially buggy algorithm "
                << AlgorithmToString(alg) << " for conv " << instr->ToString();
      continue;
    }

    se::RedzoneAllocator scratch_allocator(
        stream, allocator, PtxOptsFromConfig(hlo_module_config));
    se::dnn::ProfileResult profile_result;
    VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
            << instr->ToString();

    // Use assignment instead of brace-list to make GCC 4.9 happy.
    RunConvOptions options;
    options.profile_result = &profile_result;
    options.algo_override = alg;
    Status launch_status =
        RunGpuConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
                   &scratch_allocator, stream, options);

    if (!launch_status.ok()) {
      continue;
    }

    if (!profile_result.is_valid()) {
      continue;
    }

    profile_results.emplace_back();
    AutotuneResult& result = profile_results.back();
    result.mutable_conv()->set_algorithm(alg.algo_id());
    result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());

    int64 scratch_bytes_used =
        scratch_allocator.TotalAllocatedBytesExcludingRedzones();
    result.set_scratch_bytes(scratch_bytes_used);
    *result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto(
        absl::Milliseconds(profile_result.elapsed_time_in_ms()));

    if (!check_conv) {
      continue;
    }

    // Check for writes to redzones.
    TF_ASSIGN_OR_RETURN(bool input_output_allocator_redzone_clear,
                        CheckRedzones(input_output_allocator, stream,
                                      "input/output", instr, &result));

    TF_ASSIGN_OR_RETURN(
        bool scratch_allocator_redzone_clear,
        CheckRedzones(scratch_allocator, stream, "scratch", instr, &result));

    if (!input_output_allocator_redzone_clear ||
        !scratch_allocator_redzone_clear) {
      AlgorithmBlacklist proto;
      auto entry = proto.add_entries();
      entry->set_hlo(canonical_hlo);
      *entry->mutable_cc() = GetComputeCapability(stream_exec_);
      *entry->mutable_cudnn_version() = GetCudnnVersion(stream_exec_);
      entry->set_blas_version(blas_version);
      auto algo = entry->add_algos();
      algo->set_id(alg.algo_id());
      algo->set_tensor_ops(alg.tensor_ops_enabled());

      LOG(ERROR)
          << "To blacklist this algorithm for this convolution, "
             "copy-paste the following "
             "proto to the blacklist file pointed by XLA_FLAGS "
             "--xla_gpu_algorithm_blacklist_path="
          << GetDebugOptionsFromFlags().xla_gpu_algorithm_blacklist_path()
          << " : " << proto.ShortDebugString();
      continue;
    }

    if (comparator.has_value()) {
      XLA_SCOPED_LOGGING_TIMER_LEVEL("BufferComparator::CompareEqual", 2);
      StatusOr<bool> compare_result = comparator->CompareEqual(
          stream, reference_result_buffer, result_buffer);
      if (!compare_result.ok()) {
        LOG(ERROR) << "Unable to compare " << AlgorithmToString(first_algorithm)
                   << " against " << AlgorithmToString(alg) << " for "
                   << instr->ToString() << ": " << compare_result.status();
        if (compare_result.status().code() ==
            tensorflow::error::RESOURCE_EXHAUSTED) {
          // Possibly OOM. Propagate the error.
          return compare_result.status();
        }
        CHECK(!crash_on_checking_failure);
      } else if (!compare_result.ValueOrDie()) {
        LOG(ERROR)
            << "Results mismatch between different convolution algorithms. "
               "This is likely a bug/unexpected loss of precision in cudnn.\n"
            << instr->ToString() << " for "
            << AlgorithmToString(first_algorithm) << " vs "
            << AlgorithmToString(alg);
        PrintPlatformInfo(stream);
        VLOG(1) << "Full module on failure: \n"
                << instr->GetModule()->ToString();
        auto* fail = result.mutable_failure();
        fail->set_kind(AutotuneResult::WRONG_RESULT);
        fail->set_buffer_address(
            reinterpret_cast<uint64>(result_buffer.opaque()));
        auto* reference_conv = fail->mutable_reference_conv();
        reference_conv->set_algorithm(first_algorithm.algo_id());
        reference_conv->set_tensor_ops_enabled(
            first_algorithm.tensor_ops_enabled());
      }
    } else {
      XLA_SCOPED_LOGGING_TIMER_LEVEL("BufferComparator::Create", 2);
      comparator.emplace(result_shape, hlo_module_config);
      TF_ASSIGN_OR_RETURN(
          reference_result_buffer,
          input_output_allocator.AllocateBytes(result_buffer.size()));
      stream->ThenMemcpy(&reference_result_buffer, result_buffer,
                         result_buffer.size());
      first_algorithm = alg;
    }
  }

  // Log the autotuning result.
  {
    tensorflow::AutotuningLog log;
    {
      ConvInstructionLog instr_log;
      *instr_log.mutable_instruction() = instr->ToProto();
      for (int i = 0; i < instr->operand_count(); i++) {
        *instr_log.add_operand_shapes() = instr->operand(i)->shape().ToProto();
        instr_log.add_operand_addresses(
            reinterpret_cast<uint64>(operand_buffers[i].opaque()));
      }
      instr_log.set_result_address(
          reinterpret_cast<uint64>(result_buffer.opaque()));
      log.mutable_instr()->PackFrom(instr_log);
    }
    for (const auto& profile : profile_results) {
      *log.add_results() = profile;
    }
    *log.mutable_compute_capability() = GetComputeCapability(stream_exec_);
    *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec_);
    log.set_device_pci_bus_id(
        stream_exec_->GetDeviceDescription().pci_bus_id());
    log.set_blas_version(blas_version);
    VLOG(1) << "Autotuning result: " << log.ShortDebugString();
    // If we crash on checking failure, we are in a testing/benchmark mode, thus
    // omitting logging through the logger.
    if (!crash_on_checking_failure) {
      tensorflow::Logger::GetSingleton()->LogProto(log);
    }
  }

  // Crash on miscompares and redzone violations if desired. Do this after
  // logging the autotuning results, otherwise we won't get any data!
  for (const auto& result : profile_results) {
    if (result.has_failure()) {
      CHECK(!crash_on_checking_failure);
    }
  }

  // Choose the fastest convolution that doesn't produce a REDZONE_MODIFIED
  // error.
  //
  // TODO(jlebar): We ought to be able to detect redzone reads by noticing NaNs
  // in the output of the conv and skip those.
  //
  // For now, we ignore WRONG_RESULT failures because false-positives are
  // possible (e.g. perhaps the reference algorithm is the one that's
  // incorrect!). But we don't ignore REDZONE_MODIFIED failures because they're
  // quite severe and can be detected with high accuracy.
  std::vector<AutotuneResult> filtered_results;
  absl::c_copy_if(
      profile_results, std::back_inserter(filtered_results),
      [](const AutotuneResult& r) {
        return !(r.has_failure() &&
                 r.failure().kind() != AutotuneResult::WRONG_RESULT);
      });
  if (filtered_results.empty()) {
    return InternalError(
        "All algorithms tried for convolution %s failed. Falling back to "
        "default algorithm.",
        instr->ToString());
  }

  auto selected_result = filtered_results.begin();
  if (!RequireCudnnDeterminism()) {
    selected_result = absl::c_min_element(
        filtered_results,
        [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
          return tensorflow::proto_utils::FromDurationProto(lhs.run_time()) <
                 tensorflow::proto_utils::FromDurationProto(rhs.run_time());
        });
  }

  return *selected_result;
}

StatusOr<tensorflow::AutotuneResult>
GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
    const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
    se::Stream* stream) {
  XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
      "GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr->ToString()));

  const auto device_ordinal = stream_exec_->device_ordinal();
  std::vector<se::DeviceMemoryBase> operand_buffers;

  ScratchAllocator input_output_allocator(device_ordinal, allocator);
  const auto initialize_buffer = [stream](DeviceMemoryBase buffer) {
    // Although we don't have evidence this matters, zero out the buffers
    // before autotuning. It's conceivable that using uninitialized memory as
    // the inputs might affect performance if e.g. the inputs contain
    // denormals, and this is easy enough.
    stream->ThenMemZero(&buffer, buffer.size());
  };

  // Allocate space for the input, filter, and output of the convolution. We
  // use a ScratchAllocator for this instead of calling allocator_ directly so
  // that our allocations don't leak.
  for (const auto* operand : instr->operands()) {
    TF_ASSIGN_OR_RETURN(auto buffer,
                        input_output_allocator.AllocateBytes(
                            ShapeUtil::ByteSizeOf(operand->shape())));
    initialize_buffer(buffer);
    operand_buffers.push_back(buffer);
  }

  TF_ASSIGN_OR_RETURN(
      auto result_buffer,
      input_output_allocator.AllocateBytes(
          ShapeUtil::ByteSizeOf(instr->shape().tuple_shapes(0))));
  initialize_buffer(result_buffer);

  TF_ASSIGN_OR_RETURN(std::vector<se::dnn::ProfileResult> algorithms,
                      GetAlgorithms(instr, absl::MakeSpan(operand_buffers),
                                    result_buffer, stream_exec_, stream));

  std::vector<AutotuneResult> profile_results;

  if (algorithms.size() == 1) {
    auto profile_result = algorithms[0];
    profile_results.emplace_back();
    auto& result = profile_results.back();
    result.mutable_conv()->set_algorithm(profile_result.algorithm().algo_id());
    result.mutable_conv()->set_tensor_ops_enabled(
        profile_result.algorithm().tensor_ops_enabled());

    result.set_scratch_bytes(profile_result.scratch_size());
    *result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto(
        absl::Milliseconds(profile_result.elapsed_time_in_ms()));
  } else {
    for (const auto& miopen_alg : algorithms) {
      const auto& alg = miopen_alg.algorithm();
      XLA_SCOPED_LOGGING_TIMER_LEVEL(
          absl::StrCat("CudnnConvAlgorithmPicker::PickBestAlgorithm algo ",
                       AlgorithmToString(alg)),
          2);

      ScratchAllocator scratch_allocator(device_ordinal, allocator);
      se::dnn::ProfileResult profile_result;
      VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
              << instr->ToString();

      // Use assignment instead of brace-list to make GCC 4.9 happy.
      RunConvOptions options;
      options.profile_result = &profile_result;
      options.algo_override = alg;
      Status launch_status =
          RunGpuConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
                     &scratch_allocator, stream, options);

      if (!launch_status.ok()) {
        continue;
      }

      if (!profile_result.is_valid()) {
        continue;
      }

      profile_results.emplace_back();
      AutotuneResult& result = profile_results.back();
      result.mutable_conv()->set_algorithm(alg.algo_id());
      result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());

      int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
      result.set_scratch_bytes(scratch_bytes_used);
      *result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto(
          absl::Milliseconds(profile_result.elapsed_time_in_ms()));
    }
  }
  const auto& best_result = absl::c_min_element(
      profile_results,
      [&](const AutotuneResult& lhs, const AutotuneResult& rhs) {
        return tensorflow::proto_utils::FromDurationProto(lhs.run_time()) <
               tensorflow::proto_utils::FromDurationProto(rhs.run_time());
      });

  if (best_result != profile_results.end()) {
    return *best_result;
  }

  return InternalError(
      "All algorithms tried for convolution %s failed. Falling back to "
      "default algorithm.",
      instr->ToString());
}

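// Rewrites a single cudnn/miopen convolution custom-call so that its backend
// config records the autotuned algorithm and its output tuple reserves the
// required amount of scratch memory.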
StatusOr<bool> GpuConvAlgorithmPicker::RunOnInstruction(HloInstruction* instr) {
  CHECK(IsCustomCallToDnnConvolution(*instr));

  StatusOr<AutotuneResult> best_algo_or =
      PickBestAlgorithm(Cast<HloCustomCallInstruction>(instr));
  if (!best_algo_or.ok()) {
    LOG(WARNING) << "Failed to determine best cudnn convolution algorithm: "
                 << best_algo_or.status()
                 << "\n\nConvolution performance may be suboptimal.";
    return false;
  }

  auto best_algo = std::move(best_algo_or).ValueOrDie();
  VLOG(2) << "Setting cudnn conv to use algorithm "
          << best_algo.conv().algorithm() << " and "
          << NumBytesToString(best_algo.scratch_bytes())
          << " of scratch memory: " << instr->ToString()
          << " tensor_ops_enabled: " << best_algo.conv().tensor_ops_enabled();

  // Replace instr with a new CustomCall which has the correct algorithm, and
  // whose output shape has the appropriate amount of scratch memory.
  HloComputation* computation = instr->parent();
  Shape new_call_shape = ShapeUtil::MakeTupleShape(
      {instr->shape().tuple_shapes(0),
       ShapeUtil::MakeShape(U8, {best_algo.scratch_bytes()})});

  TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
                      instr->backend_config<CudnnConvBackendConfig>());
  backend_config.set_algorithm(best_algo.conv().algorithm());
  backend_config.set_tensor_ops_enabled(best_algo.conv().tensor_ops_enabled());

  HloInstruction* new_call = computation->AddInstruction(
      instr->CloneWithNewOperands(new_call_shape, instr->operands()));

  VLOG(2) << "Replacing convolution " << instr->ToString() << " with "
          << new_call->ToString();

  TF_RETURN_IF_ERROR(new_call->set_backend_config(backend_config));

  // Repackage new_call so it has the same shape as the original call, namely
  // (conv_result, u8[0]).
  HloInstruction* new_tuple =
      computation->AddInstruction(HloInstruction::CreateTuple(
          {computation->AddInstruction(HloInstruction::CreateGetTupleElement(
               new_call_shape.tuple_shapes(0), new_call, 0)),
           computation->AddInstruction(HloInstruction::CreateConstant(
               LiteralUtil::CreateR1<uint8>({})))}));

  TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple));
  return true;
}

StatusOr<bool> GpuConvAlgorithmPicker::RunOnComputation(
    HloComputation* computation) {
  std::vector<HloInstruction*> convs;
  for (auto* instr : computation->instructions()) {
    if (IsCustomCallToDnnConvolution(*instr)) {
      convs.push_back(instr);
    }
  }

  bool changed = false;
  for (auto* instr : convs) {
    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr));
    changed |= result;
  }
  return changed;
}

StatusOr<bool> GpuConvAlgorithmPicker::Run(HloModule* module) {
  XLA_SCOPED_LOGGING_TIMER("GpuConvAlgorithmPicker");

  if (module->config().debug_options().xla_gpu_autotune_level() == 0) {
    VLOG(2) << "Convolution auto-tuning disabled, GpuConvAlgorithmPicker "
               "returning early.";
    return false;
  }

  bool changed = false;
  for (HloComputation* computation : module->MakeNonfusionComputations()) {
    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
    changed |= result;
  }

  {
    tensorflow::mutex_lock lock(autotune_cache_lock);
    autotune_cache_stats.LogStats();
  }

  return changed;
}

}  // namespace gpu
}  // namespace xla