/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.h"

#include "absl/algorithm/container.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/time/time.h"
#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/logger.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/proto/proto_utils.h"
#include "tensorflow/stream_executor/gpu/redzone_allocator.h"

namespace xla {
namespace gpu {
namespace {

using absl::optional;
using se::DeviceMemoryBase;
using se::dnn::AlgorithmDesc;
using tensorflow::AutotuneResult;

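// A simple scratch allocator used during autotuning: buffers are retained in
// `allocated_buffers_` and released when the allocator is destroyed, so
// profiling candidate algorithms does not leak device memory.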
class ScratchAllocator : public se::ScratchAllocator {
 public:
  ScratchAllocator(int device_ordinal,
                   se::DeviceMemoryAllocator* memory_allocator)
      : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}

  int64 GetMemoryLimitInBytes() override {
    return 1LL << 32;  // 4GB.  TODO(jlebar): Tune this?
  }
  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }

  StatusOr<se::DeviceMemory<uint8>> AllocateBytes(int64 byte_size) override;

  template <typename T>
  StatusOr<se::DeviceMemory<T>> Allocate(int64 num_elements) {
    TF_ASSIGN_OR_RETURN(se::DeviceMemory<uint8> bytes,
                        AllocateBytes(num_elements * sizeof(T)));
    return se::DeviceMemory<T>(bytes);
  }

 private:
  const int device_ordinal_;
  se::DeviceMemoryAllocator* memory_allocator_;
  std::vector<se::OwningDeviceMemory> allocated_buffers_;
  int64 total_allocated_bytes_ = 0;
};

StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
    int64 byte_size) {
  CHECK_GE(byte_size, 0) << "byte_size must be non-negative.";
  if (byte_size > GetMemoryLimitInBytes()) {
    return se::port::Status(
        se::port::error::RESOURCE_EXHAUSTED,
        absl::StrFormat(
            "Allocating %d bytes exceeds the memory limit of %d bytes.",
            byte_size, GetMemoryLimitInBytes()));
  }

  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer,
                      memory_allocator_->Allocate(device_ordinal_, byte_size,
                                                  /*retry_on_failure=*/false));
  total_allocated_bytes_ += byte_size;

  se::DeviceMemoryBase buffer_addr = *allocated_buffer;
  allocated_buffers_.push_back(std::move(allocated_buffer));
  return se::DeviceMemory<uint8>(buffer_addr);
}

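// Returns the list of candidate convolution algorithms that the DNN backend
// (cuDNN) reports for the given convolution kind.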
std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                         se::StreamExecutor* stream_exec) {
  std::vector<AlgorithmDesc> algorithms;
  bool succ = false;
  switch (kind) {
    case CudnnConvKind::kBackwardFilter:
      succ =
          stream_exec->GetConvolveBackwardFilterAlgorithms(true, &algorithms);
      break;
    case CudnnConvKind::kBackwardInput:
      succ = stream_exec->GetConvolveBackwardDataAlgorithms(true, &algorithms);
      break;
    case CudnnConvKind::kForward:
    case CudnnConvKind::kForwardActivation:
      succ = stream_exec->GetConvolveAlgorithms(true, &algorithms);
      break;
  }
  DCHECK(succ);

  return algorithms;
}

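// ROCm path: returns the candidate algorithms (as ProfileResults) that MIOpen
// reports for the convolution described by `conv`.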
StatusOr<std::vector<se::dnn::ProfileResult>> GetAlgorithms(
    const HloCustomCallInstruction* conv,
    absl::Span<se::DeviceMemoryBase> operand_buffers,
    se::DeviceMemoryBase result_buffer, se::StreamExecutor* stream_exec,
    se::Stream* stream) {
  std::vector<se::dnn::ProfileResult> algorithms;

  TF_ASSIGN_OR_RETURN(se::dnn::ConvolutionKind kind,
                      GetDnnConvolutionKind(conv));

  TF_ASSIGN_OR_RETURN(se::dnn::DataType dtype, GetDnnDataType(conv));

  TF_ASSIGN_OR_RETURN(GpuConvParams params,
                      GetGpuConvParams(conv, operand_buffers, result_buffer));

  bool succ = stream_exec->GetMIOpenConvolveAlgorithms(
      kind, stream, dtype, params.input_descriptor, params.filter_descriptor,
      params.conv_desc, params.output_descriptor, &algorithms);
  DCHECK(succ);

  return algorithms;
}

string AlgorithmToString(const AlgorithmDesc& algo) {
  if (algo.tensor_ops_enabled()) {
    return absl::StrCat(algo.algo_id(), "+TC");
  }
  return absl::StrCat(algo.algo_id());
}

string NumBytesToString(int64 bytes) {
  return absl::StrCat(tensorflow::strings::HumanReadableNumBytes(bytes), " (",
                      bytes, "B)");
}

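// Returns the cuDNN version reported by the StreamExecutor's DNN backend, or a
// default-initialized proto if the version cannot be queried.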
tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
  tensorflow::CudnnVersion cudnn_version;
  if (auto* dnn = stream_executor->AsDnn()) {
    StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
    if (version_or.ok()) {
      const auto& version = version_or.ValueOrDie();
      cudnn_version.set_major(version.major_version());
      cudnn_version.set_minor(version.minor_version());
      cudnn_version.set_patch(version.patch());
    }
  }
  return cudnn_version;
}

tensorflow::ComputeCapability GetComputeCapability(
    se::StreamExecutor* stream_executor) {
  tensorflow::ComputeCapability cc;
  int cc_major, cc_minor;
  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
                                                                  &cc_minor);
  cc.set_major(cc_major);
  cc.set_minor(cc_minor);
  return cc;
}

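// Logs device, platform, driver, runtime, and cuDNN version information.  Used
// when reporting autotuning failures below.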
void PrintPlatformInfo(const se::Stream* stream) {
  auto* se = stream->parent();
  const auto& desc = se->GetDeviceDescription();
  LOG(ERROR) << "Device: " << desc.name();
  LOG(ERROR) << "Platform: " << desc.platform_version();
  LOG(ERROR) << "Driver: " << desc.driver_version();
  LOG(ERROR) << "Runtime: " << desc.runtime_version();

  auto* dnn = se->AsDnn();
  if (dnn) {
    auto dnn_version = dnn->GetVersion();
    if (dnn_version.ok()) {
      auto v = dnn_version.ValueOrDie();
      LOG(ERROR) << "cudnn version: " << v.major_version() << "."
                 << v.minor_version() << "." << v.patch();
    }
  }
}

// Returns true if the redzones in `allocator`'s allocations are unmodified.
//
// If the redzones are modified, logs an error, sets the appropriate failure
// bits on `result`, and returns false.
//
// Returns an error status if an unexpected error occurred, in which case the
// stream has been poisoned.
//
// `name` is a user-friendly name for the set of redzones being checked, e.g.
// "input/output" or "scratch".
StatusOr<bool> CheckRedzones(const se::RedzoneAllocator& allocator,
                             se::Stream* stream, absl::string_view name,
                             const HloInstruction* instr,
                             AutotuneResult* result) {
  XLA_SCOPED_LOGGING_TIMER_LEVEL("CudnnConvAlgorithmPicker checking redzones",
                                 2);
  using RedzoneCheckStatus = se::RedzoneAllocator::RedzoneCheckStatus;
  TF_ASSIGN_OR_RETURN(RedzoneCheckStatus redzone_check,
                      allocator.CheckRedzones());
  if (redzone_check.ok()) {
    return true;
  }

  auto* fail = result->mutable_failure();
  fail->set_kind(AutotuneResult::REDZONE_MODIFIED);
  *fail->mutable_msg() = redzone_check.RedzoneFailureMsg();
  fail->set_buffer_address(
      reinterpret_cast<uint64>(redzone_check.user_buffer_address));

  LOG(ERROR) << absl::StreamFormat(
      "Detected cudnn out-of-bounds write in conv %s buffer! This is likely a "
      "cudnn bug. We will skip this algorithm in the future, but your GPU "
      "state may already be corrupted, leading to incorrect results. Within "
      "Google, no action is needed on your part. Outside of Google, please "
      "ensure you're running the latest version of cudnn. If that doesn't fix "
      "the problem, please file a bug with this full error message and we'll "
      "contact nvidia.",
      name);
  LOG(ERROR) << redzone_check.RedzoneFailureMsg();
  LOG(ERROR) << "HloInstruction " << instr->ToString();
  PrintPlatformInfo(stream);
  return false;
}

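// Autotuning results are cached per (StreamExecutor, canonicalized conv HLO)
// pair, so an identical convolution is only tuned once per executor.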
using ConvCacheKey =
    std::tuple<se::StreamExecutor*,
               /* conv->ToString(HloPrintOptions::Canonical()) */ std::string>;

struct ConvCacheStats {
  int64 cache_hits = 0;
  int64 cache_misses = 0;

  void LogStats() {
    VLOG(2) << "Cache hits: " << cache_hits;
    VLOG(2) << "Cache misses: " << cache_misses;
  }
};

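// Builds the cache key from the executor and the conv's canonical string form,
// including its backend config so that differently-configured convs are tuned
// separately.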
ConvCacheKey AutotuneCacheKeyfromInstruction(
    const HloCustomCallInstruction* conv, se::StreamExecutor* se) {
  auto options = HloPrintOptions::Canonical();
  options.set_print_backend_config(true);
  return std::make_tuple(se, conv->ToString(options));
}

tensorflow::mutex autotune_cache_lock(tensorflow::LINKER_INITIALIZED);
auto& autotune_cache GUARDED_BY(autotune_cache_lock) =
    *new absl::flat_hash_map<ConvCacheKey, AutotuneResult>();
auto& autotune_cache_stats GUARDED_BY(autotune_cache_lock) =
    *new ConvCacheStats();
}  // anonymous namespace

StatusOr<AutotuneResult> GpuConvAlgorithmPicker::PickBestAlgorithm(
    const HloCustomCallInstruction* instr) {
  // Don't run this function concurrently on the same GPU.
  //
  // This is a bit of a hack and doesn't protect us against arbitrary concurrent
  // use of a GPU, but it's sufficient to let us compile two HLO modules
  // concurrently and then run them sequentially.
  //
  // Putting the lock in here rather than in PickBestAlgorithmNoCache lets us
  // avoid ever doing duplicate work.  If we have a cache miss, only one thread
  // will run PickBestAlgorithmNoCache* for a particular device.
  tensorflow::mutex_lock lock = LockGpu(stream_exec_);

  // We cache the autotuning results to avoid doing duplicate work, which can
  // greatly improve both stability (deterministic numeric results within a
  // process for a given input) and performance (2x speedup on some models).
  ConvCacheKey key = AutotuneCacheKeyfromInstruction(instr, stream_exec_);
  {
    tensorflow::mutex_lock lock(autotune_cache_lock);
    auto it = autotune_cache.find(key);
    if (it != autotune_cache.end()) {
      autotune_cache_stats.cache_hits++;
      return it->second;
    }
    autotune_cache_stats.cache_misses++;
  }

  // Make sure any previous activity on this executor is done. We don't want to
  // interfere with programs that are still running on the GPU.
  if (!stream_exec_->SynchronizeAllActivity()) {
    return InternalError("Failed to synchronize GPU for autotuning.");
  }

  // allocator either points to this->allocator_ or, if that's null, to a
  // se::StreamExecutorMemoryAllocator for stream_exec_.
  se::DeviceMemoryAllocator* allocator;
  optional<se::StreamExecutorMemoryAllocator> se_allocator;
  if (allocator_ != nullptr) {
    allocator = allocator_;
  } else {
    se_allocator.emplace(stream_exec_);
    allocator = &*se_allocator;
  }

  TF_ASSIGN_OR_RETURN(se::Stream* const stream,
                      allocator->GetStream(stream_exec_->device_ordinal()));
  StatusOr<AutotuneResult> result_or(InternalError("Unknown platform."));
  // Check which platform the StreamExecutor targets; the ROCm and CUDA
  // implementations have diverged. In particular, the redzone-allocator
  // utilities must not be used on the ROCm path.
  if (stream_exec_->platform_kind() == se::PlatformKind::kROCm) {
    result_or = PickBestAlgorithmNoCacheRocm(instr, allocator, stream);
  } else if (stream_exec_->platform_kind() == se::PlatformKind::kCuda) {
    result_or = PickBestAlgorithmNoCacheCuda(instr, allocator, stream);
  }

  if (result_or.ok()) {
    tensorflow::mutex_lock lock(autotune_cache_lock);
    CHECK(autotune_cache.insert({key, result_or.ValueOrDie()}).second);
  }
  return result_or;
}

// The following function allows deterministic ops to be implemented relatively
// quickly using environment variables. It is intended to be temporary. The
// longer-term intention is to enable deterministic ops via tf.config and
// appropriate plumbing. See the discussion on PR 34951 for more information:
// https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
// This function and associated comment are replicated in the following three
// places:
//   1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
//   2. tensorflow/core/kernels/gpu_utils.cc
//   3. tensorflow/stream_executor/cuda/cuda_dnn.cc
// When implementing the plumbing, you should also search for the use of
// TF_DETERMINISTIC_OPS on its own.
// TODO(duncanriach): move to an API that uses tf.config and implement the first
//                    phase of plumbing.
static bool RequireCudnnDeterminism() {
  static bool require_cudnn_determinism = [] {
    bool deterministic_ops = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
                                               /*default_val=*/false,
                                               &deterministic_ops));
    bool cudnn_deterministic = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC",
                                               /*default_val=*/false,
                                               &cudnn_deterministic));
    return deterministic_ops || cudnn_deterministic;
  }();
  return require_cudnn_determinism;
}

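// CUDA autotuning path.  For each candidate algorithm this runs the
// convolution into redzone-guarded buffers, records scratch usage and elapsed
// time, and (at high enough autotune levels) checks the redzones and compares
// the output against the first successful algorithm.  All results are logged,
// and the fastest algorithm without a REDZONE_MODIFIED failure is returned.
// When RequireCudnnDeterminism() is true (e.g. TF_DETERMINISTIC_OPS=1 or
// TF_CUDNN_DETERMINISTIC=1 in the environment), the first valid algorithm is
// returned instead of the fastest one.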
StatusOr<tensorflow::AutotuneResult>
GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
    const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
    se::Stream* stream) {
  // The redzone allocator is currently only available for the CUDA target.
  XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
      "GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr->ToString()));

  const Shape& result_shape = instr->shape().tuple_shapes(0);
  int64 rng_state = 0;

  const HloModuleConfig& hlo_module_config = instr->GetModule()->config();
  const int32 conv_autotune_level =
      hlo_module_config.debug_options().xla_gpu_autotune_level();
  const bool init_conv_data = conv_autotune_level > 1;
  const bool check_conv = conv_autotune_level > 3;
  const auto initialize_buffer = [init_conv_data, &stream, &rng_state](
                                     DeviceMemoryBase buffer,
                                     const Shape& buffer_shape) {
    if (init_conv_data) {
      InitializeBuffer(stream, buffer_shape.element_type(), &rng_state, buffer);
    }
  };

  // Allocate space for the input, filter, and output of the convolution.
  se::RedzoneAllocator input_output_allocator(
      stream, allocator, PtxOptsFromConfig(hlo_module_config));
  std::vector<se::DeviceMemoryBase> operand_buffers;
  for (const auto* operand : instr->operands()) {
    TF_ASSIGN_OR_RETURN(auto buffer,
                        input_output_allocator.AllocateBytes(
                            ShapeUtil::ByteSizeOf(operand->shape())));
    initialize_buffer(buffer, operand->shape());
    operand_buffers.push_back(buffer);
  }
  TF_ASSIGN_OR_RETURN(auto result_buffer,
                      input_output_allocator.AllocateBytes(
                          ShapeUtil::ByteSizeOf(result_shape)));
  initialize_buffer(result_buffer, result_shape);

  TF_ASSIGN_OR_RETURN(auto backend_config,
                      instr->backend_config<CudnnConvBackendConfig>());

  optional<BufferComparator> comparator;
  // Use the first algorithm that runs successfully as the reference result.
  // There is no particular reason to prefer it; any algorithm would do.  Being
  // the reference does not imply the algorithm is correct.
  se::DeviceMemoryBase reference_result_buffer;
  AlgorithmDesc first_algorithm;

  TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
  std::vector<AutotuneResult> profile_results;

  const DebugOptions& debug_options =
      instr->GetModule()->config().debug_options();

  const bool crash_on_checking_failure =
      debug_options.xla_gpu_crash_on_verification_failures();

  const auto canonical_hlo =
      std::get<1>(AutotuneCacheKeyfromInstruction(instr, stream_exec_));

  string blas_version;
  if (auto* blas = stream_exec_->AsBlas()) {
    (void)blas->GetVersion(&blas_version);
  }

  absl::Span<const AlgorithmDesc> blacklisted_algos =
      GetBlacklistedConvAlgorithms(GetComputeCapability(stream_exec_),
                                   GetCudnnVersion(stream_exec_), blas_version,
                                   canonical_hlo);

  for (const AlgorithmDesc& alg : GetAlgorithms(kind, stream_exec_)) {
    XLA_SCOPED_LOGGING_TIMER_LEVEL(
        absl::StrCat("CudnnConvAlgorithmPicker::PickBestAlgorithm algo ",
                     AlgorithmToString(alg)),
        2);

    if (absl::c_linear_search(blacklisted_algos, alg)) {
      LOG(INFO) << "Omitted potentially buggy algorithm "
                << AlgorithmToString(alg) << " for conv " << instr->ToString();
      continue;
    }

    se::RedzoneAllocator scratch_allocator(
        stream, allocator, PtxOptsFromConfig(hlo_module_config));
    se::dnn::ProfileResult profile_result;
    VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
            << instr->ToString();

    // Use assignment instead of brace-list to make GCC 4.9 happy.
    RunConvOptions options;
    options.profile_result = &profile_result;
    options.algo_override = alg;
    Status launch_status =
        RunGpuConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
                   &scratch_allocator, stream, options);

    if (!launch_status.ok()) {
      continue;
    }

    if (!profile_result.is_valid()) {
      continue;
    }

    profile_results.emplace_back();
    AutotuneResult& result = profile_results.back();
    result.mutable_conv()->set_algorithm(alg.algo_id());
    result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());

    int64 scratch_bytes_used =
        scratch_allocator.TotalAllocatedBytesExcludingRedzones();
    result.set_scratch_bytes(scratch_bytes_used);
    *result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto(
        absl::Milliseconds(profile_result.elapsed_time_in_ms()));

    if (!check_conv) {
      continue;
    }

    // Check for writes to redzones.
    TF_ASSIGN_OR_RETURN(bool input_output_allocator_redzone_clear,
                        CheckRedzones(input_output_allocator, stream,
                                      "input/output", instr, &result));

    TF_ASSIGN_OR_RETURN(
        bool scratch_allocator_redzone_clear,
        CheckRedzones(scratch_allocator, stream, "scratch", instr, &result));

    if (!input_output_allocator_redzone_clear ||
        !scratch_allocator_redzone_clear) {
      AlgorithmBlacklist proto;
      auto entry = proto.add_entries();
      entry->set_hlo(canonical_hlo);
      *entry->mutable_cc() = GetComputeCapability(stream_exec_);
      *entry->mutable_cudnn_version() = GetCudnnVersion(stream_exec_);
      entry->set_blas_version(blas_version);
      auto algo = entry->add_algos();
      algo->set_id(alg.algo_id());
      algo->set_tensor_ops(alg.tensor_ops_enabled());

      LOG(ERROR)
          << "To blacklist this algorithm for this convolution, "
             "copy-paste the following "
             "proto to the blacklist file pointed to by XLA_FLAGS "
             "--xla_gpu_algorithm_blacklist_path="
          << GetDebugOptionsFromFlags().xla_gpu_algorithm_blacklist_path()
          << " : " << proto.ShortDebugString();
      continue;
    }

    if (comparator.has_value()) {
      XLA_SCOPED_LOGGING_TIMER_LEVEL("BufferComparator::CompareEqual", 2);
      StatusOr<bool> compare_result = comparator->CompareEqual(
          stream, reference_result_buffer, result_buffer);
      if (!compare_result.ok()) {
        LOG(ERROR) << "Unable to compare " << AlgorithmToString(first_algorithm)
                   << " against " << AlgorithmToString(alg) << " for "
                   << instr->ToString() << ": " << compare_result.status();
        if (compare_result.status().code() ==
            tensorflow::error::RESOURCE_EXHAUSTED) {
          // Possibly OOM. Propagate the error.
          return compare_result.status();
        }
        CHECK(!crash_on_checking_failure);
      } else if (!compare_result.ValueOrDie()) {
        LOG(ERROR)
            << "Results mismatch between different convolution algorithms. "
               "This is likely a bug/unexpected loss of precision in cudnn.\n"
            << instr->ToString() << " for "
            << AlgorithmToString(first_algorithm) << " vs "
            << AlgorithmToString(alg);
        PrintPlatformInfo(stream);
        VLOG(1) << "Full module on failure: \n"
                << instr->GetModule()->ToString();
        auto* fail = result.mutable_failure();
        fail->set_kind(AutotuneResult::WRONG_RESULT);
        fail->set_buffer_address(
            reinterpret_cast<uint64>(result_buffer.opaque()));
        auto* reference_conv = fail->mutable_reference_conv();
        reference_conv->set_algorithm(first_algorithm.algo_id());
        reference_conv->set_tensor_ops_enabled(
            first_algorithm.tensor_ops_enabled());
      }
    } else {
      XLA_SCOPED_LOGGING_TIMER_LEVEL("BufferComparator::Create", 2);
      comparator.emplace(result_shape, hlo_module_config);
      TF_ASSIGN_OR_RETURN(
          reference_result_buffer,
          input_output_allocator.AllocateBytes(result_buffer.size()));
      stream->ThenMemcpy(&reference_result_buffer, result_buffer,
                         result_buffer.size());
      first_algorithm = alg;
    }
  }

  // Log the autotuning result.
  {
    tensorflow::AutotuningLog log;
    {
      ConvInstructionLog instr_log;
      *instr_log.mutable_instruction() = instr->ToProto();
      for (int i = 0; i < instr->operand_count(); i++) {
        *instr_log.add_operand_shapes() = instr->operand(i)->shape().ToProto();
        instr_log.add_operand_addresses(
            reinterpret_cast<uint64>(operand_buffers[i].opaque()));
      }
      instr_log.set_result_address(
          reinterpret_cast<uint64>(result_buffer.opaque()));
      log.mutable_instr()->PackFrom(instr_log);
    }
    for (const auto& profile : profile_results) {
      *log.add_results() = profile;
    }
    *log.mutable_compute_capability() = GetComputeCapability(stream_exec_);
    *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec_);
    log.set_device_pci_bus_id(
        stream_exec_->GetDeviceDescription().pci_bus_id());
    log.set_blas_version(blas_version);
    VLOG(1) << "Autotuning result: " << log.ShortDebugString();
    // If we crash on checking failure, we are in a testing/benchmark mode;
    // skip logging through the logger in that case.
    if (!crash_on_checking_failure) {
      tensorflow::Logger::GetSingleton()->LogProto(log);
    }
  }

  // Crash on miscompares and redzone violations if desired.  Do this after
  // logging the autotuning results, otherwise we won't get any data!
  for (const auto& result : profile_results) {
    if (result.has_failure()) {
      CHECK(!crash_on_checking_failure);
    }
  }

  // Choose the fastest convolution that doesn't produce a REDZONE_MODIFIED
  // error.
  //
  // TODO(jlebar): We ought to be able to detect redzone reads by noticing NaNs
  // in the output of the conv and skip those.
  //
  // For now, we ignore WRONG_RESULT failures because false-positives are
  // possible (e.g. perhaps the reference algorithm is the one that's
  // incorrect!).  But we don't ignore REDZONE_MODIFIED failures because they're
  // quite severe and can be detected with high accuracy.
  std::vector<AutotuneResult> filtered_results;
  absl::c_copy_if(
      profile_results, std::back_inserter(filtered_results),
      [](const AutotuneResult& r) {
        return !(r.has_failure() &&
                 r.failure().kind() != AutotuneResult::WRONG_RESULT);
      });
  if (filtered_results.empty()) {
    return InternalError(
        "All algorithms tried for convolution %s failed. Falling back to "
        "default algorithm.",
        instr->ToString());
  }

  auto selected_result = filtered_results.begin();
  if (!RequireCudnnDeterminism()) {
    selected_result = absl::c_min_element(
        filtered_results,
        [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
          return tensorflow::proto_utils::FromDurationProto(lhs.run_time()) <
                 tensorflow::proto_utils::FromDurationProto(rhs.run_time());
        });
  }

  return *selected_result;
}

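// ROCm autotuning path.  MIOpen already returns per-algorithm profiling data:
// if it reports a single algorithm, that result is used directly; otherwise
// each candidate is re-profiled with RunGpuConv and the fastest one is chosen.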
StatusOr<tensorflow::AutotuneResult>
GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
    const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
    se::Stream* stream) {
  XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
      "GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr->ToString()));

  const auto device_ordinal = stream_exec_->device_ordinal();
  std::vector<se::DeviceMemoryBase> operand_buffers;

  ScratchAllocator input_output_allocator(device_ordinal, allocator);
  const auto initialize_buffer = [stream](DeviceMemoryBase buffer) {
    // Although we don't have evidence this matters, zero out the buffers
    // before autotuning.  It's conceivable that using uninitialized memory as
    // the inputs might affect performance if e.g. the inputs contain
    // denormals, and this is easy enough.
    stream->ThenMemZero(&buffer, buffer.size());
  };

  // Allocate space for the input, filter, and output of the convolution.  We
  // use a ScratchAllocator for this instead of calling allocator_ directly so
  // that our allocations don't leak.
  for (const auto* operand : instr->operands()) {
    TF_ASSIGN_OR_RETURN(auto buffer,
                        input_output_allocator.AllocateBytes(
                            ShapeUtil::ByteSizeOf(operand->shape())));
    initialize_buffer(buffer);
    operand_buffers.push_back(buffer);
  }

  TF_ASSIGN_OR_RETURN(
      auto result_buffer,
      input_output_allocator.AllocateBytes(
          ShapeUtil::ByteSizeOf(instr->shape().tuple_shapes(0))));
  initialize_buffer(result_buffer);

  TF_ASSIGN_OR_RETURN(std::vector<se::dnn::ProfileResult> algorithms,
                      GetAlgorithms(instr, absl::MakeSpan(operand_buffers),
                                    result_buffer, stream_exec_, stream));

  std::vector<AutotuneResult> profile_results;

  if (algorithms.size() == 1) {
    auto profile_result = algorithms[0];
    profile_results.emplace_back();
    auto& result = profile_results.back();
    result.mutable_conv()->set_algorithm(profile_result.algorithm().algo_id());
    result.mutable_conv()->set_tensor_ops_enabled(
        profile_result.algorithm().tensor_ops_enabled());

    result.set_scratch_bytes(profile_result.scratch_size());
    *result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto(
        absl::Milliseconds(profile_result.elapsed_time_in_ms()));
  } else {
    for (const auto& miopen_alg : algorithms) {
      const auto& alg = miopen_alg.algorithm();
      XLA_SCOPED_LOGGING_TIMER_LEVEL(
          absl::StrCat("CudnnConvAlgorithmPicker::PickBestAlgorithm algo ",
                       AlgorithmToString(alg)),
          2);

      ScratchAllocator scratch_allocator(device_ordinal, allocator);
      se::dnn::ProfileResult profile_result;
      VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
              << instr->ToString();

      // Use assignment instead of brace-list to make GCC 4.9 happy.
      RunConvOptions options;
      options.profile_result = &profile_result;
      options.algo_override = alg;
      Status launch_status =
          RunGpuConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
                     &scratch_allocator, stream, options);

      if (!launch_status.ok()) {
        continue;
      }

      if (!profile_result.is_valid()) {
        continue;
      }

      profile_results.emplace_back();
      AutotuneResult& result = profile_results.back();
      result.mutable_conv()->set_algorithm(alg.algo_id());
      result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());

      int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
      result.set_scratch_bytes(scratch_bytes_used);
      *result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto(
          absl::Milliseconds(profile_result.elapsed_time_in_ms()));
    }
  }
  const auto& best_result = absl::c_min_element(
      profile_results,
      [&](const AutotuneResult& lhs, const AutotuneResult& rhs) {
        return tensorflow::proto_utils::FromDurationProto(lhs.run_time()) <
               tensorflow::proto_utils::FromDurationProto(rhs.run_time());
      });

  if (best_result != profile_results.end()) {
    return *best_result;
  }

  return InternalError(
      "All algorithms tried for convolution %s failed.  Falling back to "
      "default algorithm.",
      instr->ToString());
}

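// Rewrites a single conv custom-call to use the algorithm chosen by
// PickBestAlgorithm and to carry the required scratch-buffer size in its
// output shape.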
StatusOr<bool> GpuConvAlgorithmPicker::RunOnInstruction(HloInstruction* instr) {
  CHECK(IsCustomCallToDnnConvolution(*instr));

  StatusOr<AutotuneResult> best_algo_or =
      PickBestAlgorithm(Cast<HloCustomCallInstruction>(instr));
  if (!best_algo_or.ok()) {
    LOG(WARNING) << "Failed to determine best cudnn convolution algorithm: "
                 << best_algo_or.status()
                 << "\n\nConvolution performance may be suboptimal.";
    return false;
  }

  auto best_algo = std::move(best_algo_or).ValueOrDie();
  VLOG(2) << "Setting cudnn conv to use algorithm "
          << best_algo.conv().algorithm() << " and "
          << NumBytesToString(best_algo.scratch_bytes())
          << " of scratch memory: " << instr->ToString()
          << " tensor_ops_enabled: " << best_algo.conv().tensor_ops_enabled();

  // Replace instr with a new CustomCall which has the correct algorithm, and
  // whose output shape has the appropriate amount of scratch memory.
  HloComputation* computation = instr->parent();
  Shape new_call_shape = ShapeUtil::MakeTupleShape(
      {instr->shape().tuple_shapes(0),
       ShapeUtil::MakeShape(U8, {best_algo.scratch_bytes()})});

  TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
                      instr->backend_config<CudnnConvBackendConfig>());
  backend_config.set_algorithm(best_algo.conv().algorithm());
  backend_config.set_tensor_ops_enabled(best_algo.conv().tensor_ops_enabled());

  HloInstruction* new_call = computation->AddInstruction(
      instr->CloneWithNewOperands(new_call_shape, instr->operands()));

  VLOG(2) << "Replacing convolution " << instr->ToString() << " with "
          << new_call->ToString();

  TF_RETURN_IF_ERROR(new_call->set_backend_config(backend_config));

  // Repackage new_call so it has the same shape as the original call, namely
  // (conv_result, u8[0]).
  HloInstruction* new_tuple =
      computation->AddInstruction(HloInstruction::CreateTuple(
          {computation->AddInstruction(HloInstruction::CreateGetTupleElement(
               new_call_shape.tuple_shapes(0), new_call, 0)),
           computation->AddInstruction(HloInstruction::CreateConstant(
               LiteralUtil::CreateR1<uint8>({})))}));

  TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple));
  return true;
}

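// Autotunes every conv custom-call in `computation`; returns true if any
// instruction was rewritten.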
StatusOr<bool> GpuConvAlgorithmPicker::RunOnComputation(
    HloComputation* computation) {
  std::vector<HloInstruction*> convs;
  for (auto* instr : computation->instructions()) {
    if (IsCustomCallToDnnConvolution(*instr)) {
      convs.push_back(instr);
    }
  }

  bool changed = false;
  for (auto* instr : convs) {
    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr));
    changed |= result;
  }
  return changed;
}

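// Pass entry point: returns early when autotuning is disabled, otherwise
// visits every non-fusion computation and logs cache statistics at the end.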
StatusOr<bool> GpuConvAlgorithmPicker::Run(HloModule* module) {
  XLA_SCOPED_LOGGING_TIMER("GpuConvAlgorithmPicker");

  if (module->config().debug_options().xla_gpu_autotune_level() == 0) {
    VLOG(2) << "Convolution auto-tuning disabled, GpuConvAlgorithmPicker "
               "returning early.";
    return false;
  }

  bool changed = false;
  for (HloComputation* computation : module->MakeNonfusionComputations()) {
    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
    changed |= result;
  }

  {
    tensorflow::mutex_lock lock(autotune_cache_lock);
    autotune_cache_stats.LogStats();
  }

  return changed;
}

}  // namespace gpu
}  // namespace xla