• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
17 
#include <stdint.h>
#include <stdlib.h>

#include <cstdlib>
#include <cstring>
#include <map>
#include <set>
#include <utility>
24 
25 #include "absl/base/casts.h"
26 #include "absl/base/const_init.h"
27 #include "absl/container/inlined_vector.h"
28 #include "absl/debugging/leak_check.h"
29 #include "absl/memory/memory.h"
30 #include "absl/strings/str_cat.h"
31 #include "absl/strings/str_format.h"
32 #include "absl/synchronization/mutex.h"
33 #include "absl/synchronization/notification.h"
34 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
35 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
36 #include "tensorflow/stream_executor/lib/env.h"
37 #include "tensorflow/stream_executor/lib/error.h"
38 #include "tensorflow/stream_executor/lib/human_readable.h"
39 #include "tensorflow/stream_executor/lib/stacktrace.h"
40 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
41 #include "tensorflow/stream_executor/lib/threadpool.h"
42 #include "tensorflow/stream_executor/platform/logging.h"
43 #include "tensorflow/stream_executor/platform/port.h"
44 
// Process-wide debugging knobs for the CUDA driver wrapper, defined as plain
// globals rather than through a flags library.
// Forces cuInit to be skipped so initialization fails (see InternalInit).
bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
// Calls cuCtxSynchronize around every context activation/deactivation
// (see ScopedActivateContext) to localize async driver errors.
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
// Presumably restricts execution to device ordinal 0; not referenced in the
// visible portion of this file -- confirm usage before relying on it.
bool FLAGS_gpuexec_cuda_device_0_only = false;
48 
// Evaluates `expr` once; if the CUresult is not CUDA_SUCCESS, returns a
// port::InternalError composed of the caller-supplied message parts followed
// by the stringified CUresult. Only usable inside functions returning
// port::Status or port::StatusOr.
#define RETURN_IF_CUDA_RES_ERROR(expr, ...)                            \
  do {                                                                 \
    CUresult _res = (expr);                                            \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {                      \
      return port::InternalError(absl::StrCat(                         \
          __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res))); \
    }                                                                  \
  } while (0)
57 
// Evaluates `expr` once; if the CUresult is not CUDA_SUCCESS, crashes the
// process via LOG(FATAL) with the caller-supplied message parts and the
// stringified CUresult. For call sites where failure is unrecoverable.
#define FAIL_IF_CUDA_RES_ERROR(expr, ...)                   \
  do {                                                      \
    CUresult _res = (expr);                                 \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {           \
      LOG(FATAL) << absl::StrCat(__VA_ARGS__) << ": "       \
                 << ::stream_executor::gpu::ToString(_res); \
    }                                                       \
  } while (0)
66 
67 // Debugging: on each push and pop of a cuda context, verify the current context
68 // matches the expected one.
69 constexpr bool kVerifyGpuContext = false;
70 
71 namespace stream_executor {
72 namespace gpu {
73 
74 /* static */ absl::Mutex CreatedContexts::mu_{absl::kConstInit};
75 /* static */ int64_t CreatedContexts::next_id_ = 1;  // 0 means "no context"
76 
77 namespace {
78 
// Returns true iff the TF_GPU_ALLOCATOR environment variable selects the
// cudaMallocAsync-based allocator. The variable is read once on first call
// and the pointer cached for the life of the process.
bool UseCudaMallocAsyncAllocator() {
  static const char* const allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  if (allocator_env == nullptr) {
    return false;
  }
  return std::strcmp(allocator_env, "cuda_malloc_async") == 0;
}
84 
85 // Returns the current context and checks that it is in the set of CUDA contexts
86 // created by StreamExecutor (to ensure that the CUDA runtime didn't create a
87 // context behind our backs).
CurrentContext()88 CUcontext CurrentContext() {
89   CUcontext current = cuda::CurrentContextOrDie();
90   if (current != nullptr && !CreatedContexts::Has(current)) {
91     LOG(FATAL) << "current context was not created by the StreamExecutor "
92                   "cuda_driver API: "
93                << current
94                << "; a CUDA runtime call "
95                   "was likely performed without using a StreamExecutor context";
96   }
97   return current;
98 }
99 
100 // CUDA driver routines may require a large amount of stack (particularly
101 // cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
102 // stack-limited threads (such as those spawned by a default-argument
103 // thread::ThreadPool on some platforms), we run certain routines in this pool
104 // and wait for completion.
GetDriverExecutor()105 port::ThreadPool* GetDriverExecutor() {
106   static port::ThreadPool* thread_pool = new port::ThreadPool(
107       port::Env::Default(), port::ThreadOptions(), "cuda_driver", 1);
108   return thread_pool;
109 }
110 
111 }  // namespace
112 
MemorySpaceString(MemorySpace memory_space)113 std::string MemorySpaceString(MemorySpace memory_space) {
114   switch (memory_space) {
115     case MemorySpace::kHost:
116       return "host";
117     case MemorySpace::kDevice:
118       return "device";
119     default:
120       LOG(FATAL) << "impossible memory space";
121   }
122 }
123 
124 namespace {
125 
126 // Call cuCtxtSynchronize and crash if it doesn't succeed.
// Synchronizes the current context via cuCtxSynchronize and crashes (with a
// stack trace) if the call fails.
void SynchronizeOrDie() {
  FAIL_IF_CUDA_RES_ERROR(cuCtxSynchronize(),
                         "Synchronize fail: ", port::CurrentStackTrace());
}
131 
// Per-thread record of which GpuContext is currently active on this thread,
// maintained by ScopedActivateContext below.
struct ThreadLocalData {
  int64 id;             // Id of the active context (CreatedContexts ids start
                        // at 1; 0 means "no context").
  GpuContext* context;  // Only valid if id == a known good context.
  int depth;            // Nesting depth of ScopedActivateContext scopes.
};

SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
139 
140 }  // namespace
141 
// Activates `cuda_context` on this thread, recording enough thread-local
// state for the destructor to restore the previously active context. Nested
// activations of the same context are cheap (no driver call).
ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  // If this is an outermost scope, we must not assume that the CUDA context has
  // been left in the same state we left it. Other code may have run on this
  // thread and altered the context.
  if (tls->depth == 0) {
    VLOG(3) << "ScopedActivateContext switching to " << cuda_context->id();
    FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                           "Failed setting context");
    tls->depth = 1;
    tls->id = cuda_context->id();
    tls->context = cuda_context;
    // Outermost scope: nothing to restore on destruction -- the context is
    // deliberately left current when this scope exits.
    to_restore_ = nullptr;
    return;
  }

  tls->depth++;
  if (tls->id == cuda_context->id()) {
    // Re-activating the already-active context: no driver call needed.
    if (kVerifyGpuContext) {
      CHECK_EQ(CurrentContext(), cuda_context->context());
    }
    DCHECK_EQ(CurrentContext(), cuda_context->context());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching context from " << tls->id
          << " to " << cuda_context->id();

  // Nested scope switching to a different context: remember the old one so
  // the destructor can reinstate it.
  to_restore_ = tls->context;
  // Set the context and update thread local.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                         "Failed setting context");
  tls->id = cuda_context->id();
  tls->context = cuda_context;
}
180 
// Pops this activation scope: decrements the nesting depth and, if this scope
// had displaced a different context (to_restore_ != nullptr), makes that
// context current again and updates the thread-local bookkeeping.
ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
    // possible this could fail in the CurrentContext() call.
    CHECK_EQ(CurrentContext(),
             tls->context == nullptr ? nullptr : tls->context->context());
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);
  if (to_restore_ == nullptr) {
    // Leave context, tls->id, and tls->context set.
    return;
  }

  // Set context and update thread local.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(to_restore_->context()),
                         "Failed setting context");
  tls->id = to_restore_->id();
  tls->context = to_restore_;
}
206 
207 namespace {
208 
209 // Returns a stringified device number associated with pointer, primarily for
210 // logging purposes. Returns "?" if the device could not be successfully
211 // queried.
CUDAPointerToDeviceString(CUdeviceptr pointer)212 std::string CUDAPointerToDeviceString(CUdeviceptr pointer) {
213   auto value = GpuDriver::GetPointerDevice(pointer);
214   if (value.ok()) {
215     return absl::StrCat(value.ValueOrDie());
216   }
217   LOG(ERROR) << "could not query device: " << value.status();
218   return "?";
219 }
220 
221 // Returns a stringified memory space associated with pointer, primarily for
222 // logging purposes. Returns "?" if the memory space could not be successfully
223 // queried.
CUDAPointerToMemorySpaceString(CUdeviceptr pointer)224 std::string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
225   auto value = GpuDriver::GetPointerMemorySpace(pointer);
226   if (value.ok()) {
227     return MemorySpaceString(value.ValueOrDie());
228   }
229   LOG(ERROR) << "could not query device: " << value.status();
230   return "?";
231 }
232 
233 // Returns a stringified representation of whether or not peer access is
234 // permitted between the "from" and "to" pointers' associated contexts,
235 // primarily for logging purposes. Returns "error" if an error is encountered
236 // in the process of querying.
CUDAPointersToCanAccessString(CUdeviceptr from,CUdeviceptr to)237 std::string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
238   auto from_context = GpuDriver::GetPointerContext(from);
239   if (!from_context.ok()) {
240     LOG(ERROR) << "could not retrieve source pointer's context: "
241                << from_context.status();
242     return "error";
243   }
244   auto to_context = GpuDriver::GetPointerContext(to);
245   if (!to_context.ok()) {
246     LOG(ERROR) << "could not retrieve destination pointer's context: "
247                << to_context.status();
248     return "error";
249   }
250   return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
251                                         to_context.ValueOrDie())
252              ? "true"
253              : "false";
254 }
255 
256 // Actually performs the work of CUDA initialization. Wrapped up in one-time
257 // execution guard.
// Actually performs the work of CUDA initialization (cuInit). Returns OK on
// success; on failure logs diagnostics and returns ABORTED. Wrapped in a
// one-time execution guard by GpuDriver::Init below.
static port::Status InternalInit() {
  CUresult res = CUDA_ERROR_NO_DEVICE;
  if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
    // Test hook: skip cuInit entirely so the failure path is exercised.
    LOG(ERROR) << "injecting CUDA init error; initialization will fail";
  } else {
    res = cuInit(0 /* = flags */);
  }

  if (res == CUDA_SUCCESS) {
    return port::Status::OK();
  } else if (res == CUDA_ERROR_SHARED_OBJECT_INIT_FAILED) {
    // Typically a driver-library loading problem; logged at WARNING since it
    // is common on machines without a usable GPU driver.
    LOG(WARNING) << "failed call to cuInit: " << ToString(res);
  } else {
    LOG(ERROR) << "failed call to cuInit: " << ToString(res);
  }

  Diagnostician::LogDiagnosticInformation();
  return port::Status(port::error::ABORTED,
                      absl::StrCat("failed call to cuInit: ", ToString(res)));
}
278 
279 }  // namespace
280 
Init()281 /* static */ port::Status GpuDriver::Init() {
282   // Cached return value from calling InternalInit(), as cuInit need only be
283   // called once, but GpuDriver::Init may be called many times.
284   static port::Status* init_retval = [] {
285     return new port::Status(InternalInit());
286   }();
287   return *init_retval;
288 }
289 
// Resolves `device_ordinal` to a CUdevice handle via cuDeviceGet; stores the
// result in *device or returns an internal error.
/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
                                               CUdevice* device) {
  RETURN_IF_CUDA_RES_ERROR(cuDeviceGet(device, device_ordinal),
                           "Failed call to cuDeviceGet");
  return port::Status::OK();
}
296 
GetDeviceName(CUdevice device,std::string * device_name)297 /* static */ port::Status GpuDriver::GetDeviceName(CUdevice device,
298                                                    std::string* device_name) {
299   static const size_t kCharLimit = 64;
300   absl::InlinedVector<char, 4> chars(kCharLimit);
301   RETURN_IF_CUDA_RES_ERROR(
302       cuDeviceGetName(chars.begin(), kCharLimit - 1, device),
303       "Failed to get device name");
304   chars[kCharLimit - 1] = '\0';
305   *device_name = chars.begin();
306   return port::Status::OK();
307 }
308 
DeviceOptionsToContextFlags(const DeviceOptions & device_options,int * flags)309 bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
310                                  int* flags) {
311   static_assert(DeviceOptions::kMask == 0xf,
312                 "needs update for new device options");
313 
314   if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) {
315     *flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
316   }
317 
318   // If no flags are set the default is CU_CTX_SCHED_AUTO, which
319   // in Google environments is very likely to mean SPIN.
320   if (device_options.flags() & DeviceOptions::kScheduleSpin) {
321     *flags |= CU_CTX_SCHED_SPIN;
322   }
323   if (device_options.flags() & DeviceOptions::kScheduleYield) {
324     *flags |= CU_CTX_SCHED_YIELD;
325   }
326   if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) {
327     *flags |= CU_CTX_SCHED_BLOCKING_SYNC;
328   }
329 
330   return true;
331 }
332 
// Retains the device's primary context (creating it if needed), registers it
// in CreatedContexts, and returns it via *context. Care is taken not to
// disturb whatever context was current on this thread: the former context is
// saved up front and reinstated before returning.
/* static */ port::Status GpuDriver::CreateContext(
    int device_ordinal, CUdevice device, const DeviceOptions& device_options,
    GpuContext** context) {
  *context = nullptr;

  int flags = 0;
  if (!DeviceOptionsToContextFlags(device_options, &flags)) {
    LOG(WARNING) << "could not convert all device options into context flags";
  }

  CUresult res;
  CUcontext former_context;
  CUcontext new_context;

  // Primary-context flags can only be changed while the primary context is
  // inactive; if it is already active with different flags, we can only warn.
  unsigned int former_primary_context_flags;
  int former_primary_context_is_active;
  CHECK_EQ(CUDA_SUCCESS,
           cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
                                      &former_primary_context_is_active));
  if (former_primary_context_flags != flags) {
    if (former_primary_context_is_active) {
      LOG(ERROR)
          << "The primary context is active and has a different flag set ("
          << former_primary_context_flags << ") than the desired flag set ("
          << flags << ").";
    } else {
      CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
    }
  }

  former_context = cuda::CurrentContextOrDie();
  res = cuDevicePrimaryCtxRetain(&new_context, device);
  // Diagnose a pre-existing current context: harmless if it is this device's
  // primary context, suspicious if it is some other context.
  if (former_context != nullptr) {
    CUdevice former_device;
    if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
      if (former_device == device) {
        if (former_context == new_context) {
          VLOG(2) << "The primary context " << former_context << " for device "
                  << device
                  << " exists before initializing the StreamExecutor.";
        } else {
          LOG(WARNING) << "A non-primary context " << former_context
                       << " for device " << device
                       << " exists before initializing the StreamExecutor. The "
                       << "primary context is now " << new_context << ". We "
                       << "haven't verified StreamExecutor works with that.";
        }
      }
    } else {
      LOG(ERROR) << "Failed to get the device of the current context "
                 << former_context;
    }
  }
  // Restore whatever context was current before the retain call.
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));

  if (res == CUDA_SUCCESS) {
    *context = CreatedContexts::Add(new_context, device_ordinal);
    CHECK(*context != nullptr)
        << "success in this call must entail non-null result";
    VLOG(2) << "created or reused context " << new_context
            << " for this thread";
    return port::Status::OK();
  }

  std::string message =
      "failed call to cuDevicePrimaryCtxRetain: " + ToString(res);
  if (res == CUDA_ERROR_OUT_OF_MEMORY) {
    // Augment OOM failures with the device's total memory, when obtainable.
    uint64 total_memory;
    if (GetDeviceTotalMemory(device, &total_memory)) {
      absl::StrAppend(&message, "; total memory reported: ", total_memory);
    } else {
      absl::StrAppend(&message, "; could not query total memory");
    }
  }

  return port::Status(port::error::INTERNAL, message);
}
410 
DestroyContext(GpuContext * context)411 /* static */ void GpuDriver::DestroyContext(GpuContext* context) {
412   if (context == nullptr) {
413     return;
414   }
415   CUcontext former_context = CurrentContext();
416   CUresult res = cuCtxSetCurrent(context->context());
417   CUdevice device;
418   cuCtxGetDevice(&device);
419   cuCtxSetCurrent(former_context);
420 
421   res = cuDevicePrimaryCtxRelease(device);
422 
423   if (res != CUDA_SUCCESS) {
424     LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
425   }
426 
427   CreatedContexts::Remove(context->context());
428 }
429 
// Exposes the raw CUcontext underlying a GpuContext wrapper.
/* static */ CUcontext GpuDriver::GetContextHandle(GpuContext* context) {
  return context->context();
}
433 
// Reads one attribute of a loaded kernel (e.g. max threads per block) into
// *attribute_value via cuFuncGetAttribute.
/* static */ port::Status GpuDriver::FuncGetAttribute(
    CUfunction_attribute attribute, CUfunction func, int* attribute_value) {
  RETURN_IF_CUDA_RES_ERROR(cuFuncGetAttribute(attribute_value, attribute, func),
                           "Failed to query kernel attribute: ", attribute);
  return port::Status::OK();
}
440 
// Sets the preferred L1/shared-memory cache configuration for `function`.
/* static */ port::Status GpuDriver::FuncSetCacheConfig(
    CUfunction function, CUfunc_cache cache_config) {
  RETURN_IF_CUDA_RES_ERROR(cuFuncSetCacheConfig(function, cache_config),
                           "Failed to set CUDA kernel cache config");
  return port::Status::OK();
}
447 
// Returns the shared-memory bank-size configuration of `context`.
/* static */ port::StatusOr<CUsharedconfig>
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  CUsharedconfig shared_mem_config;
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuCtxGetSharedMemConfig(&shared_mem_config),
                           "Failed to get shared memory config");
  return shared_mem_config;
}
456 
// Sets the shared-memory bank-size configuration for `context`.
/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
    GpuContext* context, CUsharedconfig shared_mem_config) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuCtxSetSharedMemConfig(shared_mem_config),
                           "Failed to set shared memory config");
  return port::Status::OK();
}
464 
// Enqueues `function` on `stream` with the given grid/block geometry and
// dynamic shared-memory size. `kernel_name` is used only for logging and
// error text. Exactly one of kernel_params / extra is normally non-null, per
// the cuLaunchKernel contract.
/* static */ port::Status GpuDriver::LaunchKernel(
    GpuContext* context, absl::string_view kernel_name, CUfunction function,
    unsigned int grid_dim_x, unsigned int grid_dim_y, unsigned int grid_dim_z,
    unsigned int block_dim_x, unsigned int block_dim_y,
    unsigned int block_dim_z, unsigned int shared_mem_bytes, CUstream stream,
    void** kernel_params, void** extra) {
  ScopedActivateContext activation(context);
  VLOG(2) << "launching kernel: " << kernel_name << "; gdx: " << grid_dim_x
          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
          << " bdz: " << block_dim_z;
  RETURN_IF_CUDA_RES_ERROR(
      cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x,
                     block_dim_y, block_dim_z, shared_mem_bytes, stream,
                     kernel_params, extra),
      "Failed to launch CUDA kernel: ", kernel_name,
      " with block dimensions: ", block_dim_x, "x", block_dim_y, "x",
      block_dim_z, " and grid dimensions: ", grid_dim_x, "x", grid_dim_y, "x",
      grid_dim_z);
  return port::Status::OK();
}
486 
// Loads a compiled fat binary (CUBIN) already resident in memory into
// *module under `context`.
/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
                                               const char* cubin_bytes,
                                               CUmodule* module) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuModuleLoadFatBinary(module, cubin_bytes),
                           "Failed to load in-memory CUBIN");
  return port::Status::OK();
}
495 
LoadPtx(GpuContext * context,const char * ptx_contents,CUmodule * module)496 /* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
497                                              const char* ptx_contents,
498                                              CUmodule* module) {
499   absl::Notification notification;
500   port::Status ret = port::Status::OK();
501   GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
502                                  &notification]() {
503     ScopedActivateContext activation(context);
504     void* ptx_data = const_cast<char*>(ptx_contents);
505     static const unsigned int kLogBufferBytesLimit = 1024;
506     unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
507     unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
508     absl::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes);
509     absl::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes);
510     bool log_verbose = true;
511     CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
512                               CU_JIT_ERROR_LOG_BUFFER,
513                               CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
514                               CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE};
515     // Note that the driver API wants the contents of this values to be stored
516     // in an array of void*s, so we coerce them accordingly.
517     void* option_values[] = {
518         absl::bit_cast<void*>(uintptr_t(error_log_buffer_bytes)),
519         absl::bit_cast<void*>(error_log_buffer.data()),
520         absl::bit_cast<void*>(uintptr_t(info_log_buffer_bytes)),
521         absl::bit_cast<void*>(info_log_buffer.data()),
522         absl::bit_cast<void*>(uintptr_t(log_verbose))};
523     CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
524 
525     CUresult res;
526     {
527       // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
528       // module loading: see http://b/13248943
529       absl::LeakCheckDisabler disabler;
530       res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options), options,
531                                option_values);
532     }
533 
534     // The PTX JIT mutates the values in the option values array to reflect the
535     // size of the logs it output; now that we've made the call, read the values
536     // back out.
537     error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
538     info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
539     CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
540     CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);
541 
542     if (res != CUDA_SUCCESS) {
543       LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
544       // As a precaution for null termination of the API-provided value, ensure
545       // that at least the last byte is null.
546       error_log_buffer[error_log_buffer_bytes ? error_log_buffer_bytes - 1
547                                               : 0] = '\0';
548       LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
549                  << " bytes): " << error_log_buffer.data();
550       ret = port::InternalError(
551           absl::StrCat("Failed to load PTX text as a module: ", ToString(res)));
552       notification.Notify();
553     }
554 
555     VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
556             << " bytes): " << info_log_buffer.data();
557     VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
558             << " bytes): " << error_log_buffer.data();
559     CHECK(module != nullptr);
560     notification.Notify();
561   });
562   notification.WaitForNotification();
563 
564   return ret;
565 }
566 
// HSACO modules are ROCm-only; this CUDA implementation unconditionally
// returns an error.
/* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
                                               const char* hsaco_contents,
                                               CUmodule* module) {
  return port::InternalError(
      "Feature not supported on CUDA platform (LoadHsaco)");
}
573 
// Synchronously fills `size` bytes at `location` with `value`.
/* static */ port::Status GpuDriver::SynchronousMemsetUint8(
    GpuContext* context, CUdeviceptr location, uint8 value, size_t size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuMemsetD8(location, value, size),
                           "Failed to memset memory");
  return port::Status::OK();
}
581 
// Synchronously fills `uint32_count` 32-bit words at `location` with `value`.
/* static */ port::Status GpuDriver::SynchronousMemsetUint32(
    GpuContext* context, CUdeviceptr location, uint32 value,
    size_t uint32_count) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuMemsetD32(location, value, uint32_count),
                           "Failed to memset memory");
  return port::Status::OK();
}
590 
// Enqueues an async fill of bytes at `location` with `value` on `stream`.
// NOTE(review): despite its name, `uint32_count` is passed straight to
// cuMemsetD8Async and therefore counts *bytes* here; the name appears
// copy-pasted from the uint32 variant below -- confirm against the header
// before renaming.
/* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
    GpuContext* context, CUdeviceptr location, uint8 value, size_t uint32_count,
    CUstream stream) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemsetD8Async(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return port::Status::OK();
}
600 
// Enqueues an async fill of `uint32_count` 32-bit words at `location` with
// `value` on `stream`.
/* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
    GpuContext* context, CUdeviceptr location, uint32 value,
    size_t uint32_count, CUstream stream) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemsetD32Async(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return port::Status::OK();
}
610 
AddStreamCallback(GpuContext * context,CUstream stream,StreamCallback callback,void * data)611 /* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
612                                                CUstream stream,
613                                                StreamCallback callback,
614                                                void* data) {
615   // Note: flags param is required to be zero according to CUDA 6.0.
616   CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
617   if (res != CUDA_SUCCESS) {
618     LOG(ERROR) << "unable to add host callback: " << ToString(res);
619     return false;
620   }
621   return true;
622 }
623 
// Looks up kernel `kernel_name` in a loaded `module`, storing the handle in
// *function. Returns false (after logging) if the kernel is absent.
/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
                                               CUmodule module,
                                               const char* kernel_name,
                                               CUfunction* function) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && kernel_name != nullptr);
  CUresult res = cuModuleGetFunction(function, module, kernel_name);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
               << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}
639 
// Looks up global `symbol_name` in `module`, storing its device address
// and/or size into the non-null out-params. Failure is only VLOG'd (not an
// error) because callers probe multiple modules for the same symbol.
/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
                                             CUmodule module,
                                             const char* symbol_name,
                                             CUdeviceptr* dptr, size_t* bytes) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && symbol_name != nullptr &&
        (dptr != nullptr || bytes != nullptr));
  CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
  if (res != CUDA_SUCCESS) {
    // symbol may not be found in the current module, but it may reside in
    // another module.
    VLOG(2) << "failed to get symbol \"" << symbol_name
            << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}
658 
UnloadModule(GpuContext * context,CUmodule module)659 /* static */ void GpuDriver::UnloadModule(GpuContext* context,
660                                           CUmodule module) {
661   ScopedActivateContext activated{context};
662   CUresult res = cuModuleUnload(module);
663   if (res != CUDA_SUCCESS) {
664     LOG(ERROR) << "failed to unload module " << module
665                << "; leaking: " << ToString(res);
666   }
667 }
668 
DeviceFromContext(GpuContext * context)669 /* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
670     GpuContext* context) {
671   ScopedActivateContext activated{context};
672   CUdevice device = -1;
673   CUresult result = cuCtxGetDevice(&device);
674   if (result == CUDA_SUCCESS) {
675     return device;
676   }
677 
678   return port::Status(
679       port::error::INTERNAL,
680       absl::StrCat("failed to get device for context: ", ToString(result)));
681 }
682 
// Creates a CUDA stream in `context` with the requested scheduling priority,
// storing the handle in *stream. Returns false (after logging) on failure.
/* static */ bool GpuDriver::CreateStream(GpuContext* context, CUstream* stream,
                                          int priority) {
  // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
  // up synchronization with respect to memsets and any other things that have
  // to occur on the default stream?
  ScopedActivateContext activated{context};
  CUresult res;
  // If the priority is 0, then use the previous api to create the stream with
  // the default priority for backward compatibility. Probably there is no
  // difference in using the new api call but leaving it as is for now.
  if (priority == 0) {
    res = cuStreamCreate(stream, 0);
  } else {
    res = cuStreamCreateWithPriority(stream, 0, priority);
  }
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not allocate CUDA stream for context "
               << context->context() << ": " << ToString(res);
    return false;
  }

  VLOG(2) << "successfully created stream " << *stream << " for context "
          << context->context() << " on thread";
  return true;
}
708 
DestroyStream(GpuContext * context,CUstream * stream)709 /* static */ void GpuDriver::DestroyStream(GpuContext* context,
710                                            CUstream* stream) {
711   if (*stream == nullptr) {
712     return;
713   }
714 
715   ScopedActivateContext activated{context};
716   CUresult res = cuStreamDestroy(*stream);
717   if (res != CUDA_SUCCESS) {
718     LOG(ERROR) << "failed to destroy CUDA stream for context "
719                << context->context() << ": " << ToString(res);
720   } else {
721     VLOG(2) << "successfully destroyed stream " << *stream << " for context "
722             << context->context();
723     *stream = nullptr;
724   }
725 }
726 
DeviceAllocate(GpuContext * context,uint64 bytes)727 /* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
728                                              uint64 bytes) {
729   if (bytes == 0) {
730     return nullptr;
731   }
732 
733   ScopedActivateContext activated{context};
734   CUdeviceptr result = 0;
735   CUresult res = cuMemAlloc(&result, bytes);
736   if (res != CUDA_SUCCESS) {
737     // LOG(INFO) because this isn't always important to users (e.g. BFCAllocator
738     // implements a retry if the first allocation fails).
739     LOG(INFO) << "failed to allocate "
740               << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
741               << " bytes) from device: " << ToString(res);
742     return nullptr;
743   }
744   void* ptr = reinterpret_cast<void*>(result);
745   VLOG(2) << "allocated " << ptr << " for context " << context->context()
746           << " of " << bytes << " bytes";
747   return ptr;
748 }
749 
DeviceDeallocate(GpuContext * context,void * location)750 /* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
751                                               void* location) {
752   ScopedActivateContext activation(context);
753   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
754   CUresult res = cuMemFree(pointer);
755   if (res != CUDA_SUCCESS) {
756     LOG(ERROR) << "failed to free device memory at " << location
757                << "; result: " << ToString(res);
758   } else {
759     VLOG(2) << "deallocated " << location << " for context "
760             << context->context();
761   }
762 }
763 
UnifiedMemoryAllocate(GpuContext * context,uint64 bytes)764 /* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
765                                                     uint64 bytes) {
766   ScopedActivateContext activation(context);
767   CUdeviceptr result = 0;
768   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
769   CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
770   if (res != CUDA_SUCCESS) {
771     LOG(ERROR) << "failed to alloc " << bytes
772                << " bytes unified memory; result: " << ToString(res);
773     return nullptr;
774   }
775   void* ptr = reinterpret_cast<void*>(result);
776   VLOG(2) << "allocated " << ptr << " for context " << context->context()
777           << " of " << bytes << " bytes in unified memory";
778   return ptr;
779 }
780 
UnifiedMemoryDeallocate(GpuContext * context,void * location)781 /* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
782                                                      void* location) {
783   ScopedActivateContext activation(context);
784   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
785   CUresult res = cuMemFree(pointer);
786   if (res != CUDA_SUCCESS) {
787     LOG(ERROR) << "failed to free unified memory at " << location
788                << "; result: " << ToString(res);
789   } else {
790     VLOG(2) << "deallocated unified memory at " << location << " for context "
791             << context->context();
792   }
793 }
794 
HostAllocate(GpuContext * context,uint64 bytes)795 /* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
796   ScopedActivateContext activation(context);
797   void* host_mem = nullptr;
798   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
799   CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
800   if (res != CUDA_SUCCESS) {
801     LOG(ERROR) << "failed to alloc " << bytes
802                << " bytes on host: " << ToString(res);
803   }
804   return host_mem;
805 }
806 
HostDeallocate(GpuContext * context,void * location)807 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
808                                             void* location) {
809   ScopedActivateContext activation(context);
810   CUresult res = cuMemFreeHost(location);
811   if (res != CUDA_SUCCESS) {
812     LOG(ERROR) << "error deallocating host memory at " << location << ": "
813                << ToString(res);
814   }
815 }
816 
HostRegister(GpuContext * context,void * location,uint64 bytes)817 /* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
818                                           uint64 bytes) {
819   ScopedActivateContext activation(context);
820   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
821   CUresult res =
822       cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
823   if (res != CUDA_SUCCESS) {
824     LOG(ERROR) << "error registering host memory at " << location << ": "
825                << ToString(res);
826     return false;
827   }
828   return true;
829 }
830 
HostUnregister(GpuContext * context,void * location)831 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
832                                             void* location) {
833   ScopedActivateContext activation(context);
834   CUresult res = cuMemHostUnregister(location);
835   if (res != CUDA_SUCCESS) {
836     LOG(ERROR) << "error unregistering host memory at " << location << ": "
837                << ToString(res);
838     return false;
839   }
840   return true;
841 }
842 
843 #if CUDA_VERSION >= 10020
844 /* static */ port::StatusOr<GpuDriver::VmemSpan>
ReserveVirtualMemory(GpuContext * context,uint64 bytes)845 GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64 bytes) {
846   ScopedActivateContext activation(context);
847   CUdeviceptr base;
848   CUresult res = cuMemAddressReserve(&base, bytes, /*alignment=*/0,
849                                      /*addr=*/0, /*flags=*/0);
850   if (res != CUDA_SUCCESS) {
851     return port::InternalError(
852         absl::StrFormat("error reserving %d bytes of virtual GPU memory: %s",
853                         bytes, ToString(res)));
854   }
855   return {{base, bytes}};
856 }
857 
FreeVirtualMemory(GpuContext * context,GpuDriver::VmemSpan reservation)858 /* static */ void GpuDriver::FreeVirtualMemory(
859     GpuContext* context, GpuDriver::VmemSpan reservation) {
860   ScopedActivateContext activation(context);
861   CUresult res = cuMemAddressFree(reservation.base, reservation.size_bytes);
862   if (res != CUDA_SUCCESS) {
863     LOG(ERROR) << "error freeing vmem reservation of size "
864                << reservation.size_bytes << " at address " << reservation.base;
865   }
866 }
867 
GetMinAllocationGranularity(GpuDeviceHandle device)868 /* static */ port::StatusOr<uint64> GpuDriver::GetMinAllocationGranularity(
869     GpuDeviceHandle device) {
870   CUmemAllocationProp props = {};
871   props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
872   props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
873   props.location.id = device;
874 
875   size_t granularity;
876   CUresult res = cuMemGetAllocationGranularity(
877       &granularity, &props, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
878   if (res != CUDA_SUCCESS) {
879     return port::InternalError(absl::StrCat(
880         "failed to get min allocation granularity: ", ToString(res)));
881   }
882   return granularity;
883 }
884 
885 /* static */ port::StatusOr<GpuDriver::GenericMemoryHandle>
CreateMemoryHandle(GpuContext * context,uint64 bytes)886 GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
887   ScopedActivateContext activation(context);
888   auto device = DeviceFromContext(context);
889   if (!device.ok()) {
890     LOG(ERROR) << "Failed to get device from context" << device.status();
891     return device.status();
892   }
893 
894   CUmemAllocationProp props = {};
895   props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
896   props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
897   props.location.id = device.ValueOrDie();
898 
899   CUmemGenericAllocationHandle mem_handle;
900   CUresult res = cuMemCreate(&mem_handle, bytes, &props, 0);
901   if (res != CUDA_SUCCESS) {
902     return port::InternalError(
903         absl::StrFormat("failed to create memory allocation of size %d: %s",
904                         bytes, ToString(res)));
905   }
906   return GpuDriver::GenericMemoryHandle{mem_handle, bytes};
907 }
908 
ReleaseMemoryHandle(GpuContext * context,GpuDriver::GenericMemoryHandle handle)909 /* static */ void GpuDriver::ReleaseMemoryHandle(
910     GpuContext* context, GpuDriver::GenericMemoryHandle handle) {
911   ScopedActivateContext activation(context);
912 
913   CUresult res = cuMemRelease(handle.handle);
914   if (res != CUDA_SUCCESS) {
915     LOG(ERROR) << "Failed to release memory handle " << handle.handle
916                << " of size " << handle.bytes << ": " << ToString(res);
917   }
918 }
919 
MapMemory(GpuContext * context,CUdeviceptr va,const GpuDriver::GenericMemoryHandle & handle,const std::vector<GpuDeviceHandle> & device_handles)920 /* static */ port::Status GpuDriver::MapMemory(
921     GpuContext* context, CUdeviceptr va,
922     const GpuDriver::GenericMemoryHandle& handle,
923     const std::vector<GpuDeviceHandle>& device_handles) {
924   ScopedActivateContext activation(context);
925 
926   auto device = DeviceFromContext(context);
927   if (!device.ok()) {
928     return device.status();
929   }
930 
931   // NB: Zero is the only valid value for both flags and offset.
932   CUresult res =
933       cuMemMap(va, handle.bytes, /*offset=*/0, handle.handle, /*flags=*/0);
934   if (res != CUDA_SUCCESS) {
935     return port::InternalError(absl::StrFormat(
936         "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
937   }
938 
939   std::vector<CUmemAccessDesc> access_descriptors(device_handles.size());
940   for (int i = 0; i < access_descriptors.size(); ++i) {
941     access_descriptors[i].location.id = device_handles[i];
942     access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
943     access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
944   }
945 
946   res = cuMemSetAccess(va, handle.bytes, access_descriptors.data(),
947                        access_descriptors.size());
948   if (res != CUDA_SUCCESS) {
949     // Unmap the memory that we failed to set access for.
950     if (cuMemUnmap(va, handle.bytes) != CUDA_SUCCESS) {
951       LOG(ERROR)
952           << "Failed to unmap memory in GpuDriver::MapMemory error path.";
953     }
954     return port::InternalError(absl::StrFormat(
955         "Failed to set read/write access on memory mapped at %d: %s", va,
956         ToString(res)));
957   }
958   return port::Status::OK();
959 }
960 
UnmapMemory(GpuContext * context,CUdeviceptr va,uint64 bytes)961 /* static */ void GpuDriver::UnmapMemory(GpuContext* context, CUdeviceptr va,
962                                          uint64 bytes) {
963   ScopedActivateContext activation(context);
964 
965   CUresult res = cuMemUnmap(va, bytes);
966   if (res != CUDA_SUCCESS) {
967     LOG(ERROR) << "Failed to unmap memory at " << va << " of size " << bytes
968                << ": " << ToString(res);
969   }
970 }
971 
972 #endif
973 
DestroyEvent(GpuContext * context,CUevent * event)974 /* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
975                                                   CUevent* event) {
976   if (*event == nullptr) {
977     return port::Status(port::error::INVALID_ARGUMENT,
978                         "input event cannot be null");
979   }
980 
981   ScopedActivateContext activated{context};
982   RETURN_IF_CUDA_RES_ERROR(cuEventDestroy(*event),
983                            "Error destroying CUDA event");
984   return port::Status::OK();
985 }
986 
RecordEvent(GpuContext * context,CUevent event,CUstream stream)987 /* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
988                                                  CUevent event,
989                                                  CUstream stream) {
990   ScopedActivateContext activated{context};
991   RETURN_IF_CUDA_RES_ERROR(cuEventRecord(event, stream),
992                            "Error recording CUDA event");
993   return port::Status::OK();
994 }
995 
QueryEvent(GpuContext * context,CUevent event)996 /* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
997                                                             CUevent event) {
998   ScopedActivateContext activated{context};
999   CUresult res = cuEventQuery(event);
1000   if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
1001     return port::Status(
1002         port::error::INTERNAL,
1003         absl::StrFormat("failed to query event: %s", ToString(res)));
1004   }
1005 
1006   return res;
1007 }
1008 
GetEventElapsedTime(GpuContext * context,float * elapsed_milliseconds,CUevent start,CUevent stop)1009 /* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
1010                                                  float* elapsed_milliseconds,
1011                                                  CUevent start, CUevent stop) {
1012   ScopedActivateContext activated{context};
1013   // The stop event must have completed in order for cuEventElapsedTime to
1014   // work.
1015   CUresult res = cuEventSynchronize(stop);
1016   if (res != CUDA_SUCCESS) {
1017     LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
1018     return false;
1019   }
1020   res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
1021   if (res != CUDA_SUCCESS) {
1022     LOG(ERROR) << "failed to get elapsed time between events: "
1023                << ToString(res);
1024     return false;
1025   }
1026 
1027   return true;
1028 }
1029 
WaitStreamOnEvent(GpuContext * context,CUstream stream,CUevent event)1030 /* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
1031                                                CUstream stream, CUevent event) {
1032   ScopedActivateContext activation(context);
1033   CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
1034   if (res != CUDA_SUCCESS) {
1035     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
1036     return false;
1037   }
1038 
1039   return true;
1040 }
1041 
SynchronizeContext(GpuContext * context)1042 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
1043   ScopedActivateContext activation(context);
1044   CUresult res = cuCtxSynchronize();
1045   if (res != CUDA_SUCCESS) {
1046     LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
1047                << " :: " << port::CurrentStackTrace();
1048     return false;
1049   }
1050 
1051   return true;
1052 }
1053 
SynchronizeStream(GpuContext * context,CUstream stream)1054 /* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
1055                                                        CUstream stream) {
1056   ScopedActivateContext activated{context};
1057   CHECK(stream != nullptr);
1058   RETURN_IF_CUDA_RES_ERROR(cuStreamSynchronize(stream),
1059                            "Could not synchronize CUDA stream");
1060   return port::Status::OK();
1061 }
1062 
IsStreamIdle(GpuContext * context,CUstream stream)1063 /* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
1064                                           CUstream stream) {
1065   ScopedActivateContext activated{context};
1066   CHECK(stream != nullptr);
1067   CUresult res = cuStreamQuery(stream);
1068   if (res == CUDA_SUCCESS) {
1069     return true;
1070   }
1071 
1072   if (res != CUDA_ERROR_NOT_READY) {
1073     LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
1074   }
1075   return false;
1076 }
1077 
SynchronousMemcpyD2H(GpuContext * context,void * host_dst,CUdeviceptr gpu_src,uint64 size)1078 /* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
1079                                                           void* host_dst,
1080                                                           CUdeviceptr gpu_src,
1081                                                           uint64 size) {
1082   ScopedActivateContext activation(context);
1083   RETURN_IF_CUDA_RES_ERROR(
1084       cuMemcpyDtoH(host_dst, gpu_src, size),
1085       absl::StrFormat("failed to synchronous memcpy from device to host "
1086                       "host dst: %p; GPU src: %p; size: %u=0x%x",
1087                       host_dst, absl::bit_cast<void*>(gpu_src), size, size));
1088   VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
1089           << host_dst;
1090   return port::Status::OK();
1091 }
1092 
// Synchronously copies `size` bytes from host (`host_src`) to device
// (`gpu_dst`) within `context`; blocks until the copy completes. Returns an
// error status (with both pointers and the size) if the driver call fails.
/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
                                                          CUdeviceptr gpu_dst,
                                                          const void* host_src,
                                                          uint64 size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemcpyHtoD(gpu_dst, host_src, size),
      absl::StrFormat(
          "failed to synchronous memcpy from host to device: GPU dst: %p;"
          " host src: %p; size: %u=0x%x",
          absl::bit_cast<void*>(gpu_dst), host_src, size, size));
  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
  return port::Status::OK();
}
1107 
SynchronousMemcpyD2D(GpuContext * context,CUdeviceptr gpu_dst,CUdeviceptr gpu_src,uint64 size)1108 /* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
1109                                                           CUdeviceptr gpu_dst,
1110                                                           CUdeviceptr gpu_src,
1111                                                           uint64 size) {
1112   ScopedActivateContext activation(context);
1113 
1114   CUresult result;
1115   // CreatedContexts::GetAnyContext() doesn't works when ptr == 0.
1116   // This happens when the size is 0.
1117   if (gpu_dst == 0 || gpu_src == 0 || !UseCudaMallocAsyncAllocator()) {
1118     result = cuMemcpyDtoD(gpu_dst, gpu_src, size);
1119   } else {
1120     // Any context work here.
1121     CUcontext dst_context =
1122         CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_dst));
1123     CUcontext src_context =
1124         CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_src));
1125 
1126     if (static_cast<void*>(dst_context) == nullptr) {
1127       port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_dst);
1128       if (tmp_context.ok()) {
1129         dst_context = tmp_context.ValueOrDie()->context();
1130       }
1131     }
1132 
1133     if (static_cast<void*>(src_context) == nullptr) {
1134       port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_src);
1135       if (tmp_context.ok()) {
1136         src_context = tmp_context.ValueOrDie()->context();
1137       }
1138     }
1139 
1140     result = cuMemcpyPeer(gpu_dst, dst_context, gpu_src, src_context, size);
1141   }
1142 
1143   RETURN_IF_CUDA_RES_ERROR(
1144       result,
1145       absl::StrFormat(
1146           "failed to synchronous memcpy from host to device: GPU dst: %p; "
1147           "GPU src: %p; size: %u=0x%x",
1148           absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
1149           size));
1150   VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
1151   return port::Status::OK();
1152 }
1153 
AsynchronousMemcpyD2H(GpuContext * context,void * host_dst,CUdeviceptr gpu_src,uint64 size,CUstream stream)1154 /* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
1155                                                    void* host_dst,
1156                                                    CUdeviceptr gpu_src,
1157                                                    uint64 size,
1158                                                    CUstream stream) {
1159   ScopedActivateContext activation(context);
1160   CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
1161   if (res != CUDA_SUCCESS) {
1162     LOG(ERROR) << absl::StrFormat(
1163         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
1164         "GPU src: %p; size: %u=0x%x",
1165         ToString(res), host_dst, absl::bit_cast<void*>(gpu_src), size, size);
1166     return false;
1167   }
1168   VLOG(2) << "successfully enqueued async memcpy d2h of " << size
1169           << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
1170           << host_dst << " on stream " << stream;
1171   return true;
1172 }
1173 
AsynchronousMemcpyH2D(GpuContext * context,CUdeviceptr gpu_dst,const void * host_src,uint64 size,CUstream stream)1174 /* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
1175                                                    CUdeviceptr gpu_dst,
1176                                                    const void* host_src,
1177                                                    uint64 size,
1178                                                    CUstream stream) {
1179   ScopedActivateContext activation(context);
1180   CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
1181   if (res != CUDA_SUCCESS) {
1182     LOG(ERROR) << absl::StrFormat(
1183         "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
1184         "host src: %p; size: %u=0x%x",
1185         ToString(res), absl::bit_cast<void*>(gpu_dst), host_src, size, size);
1186     return false;
1187   }
1188   VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
1189           << " on stream " << stream;
1190   return true;
1191 }
1192 
// Enqueues an async device-to-device copy of `size` bytes on `stream`.
// Returns true if the copy was successfully enqueued (not completed). When
// the cudaMallocAsync allocator is in use, the owning context of each pointer
// is resolved so a peer copy can be issued instead of a plain D2D copy.
/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
                                                   CUdeviceptr gpu_dst,
                                                   CUdeviceptr gpu_src,
                                                   uint64 size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult result;
  // CreatedContexts::GetAnyContext() doesn't works when ptr == 0.
  // This happens when the size is 0.
  if (gpu_dst == 0 || gpu_src == 0 || !UseCudaMallocAsyncAllocator()) {
    result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
  } else {
    // Any context work here.
    CUcontext dst_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_dst));
    CUcontext src_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_src));

    // Fallback: if the bookkeeping map doesn't know the pointer, ask the
    // driver which context owns it. A failed lookup leaves the context null
    // and lets cuMemcpyPeerAsync report the error.
    if (static_cast<void*>(dst_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_dst);
      if (tmp_context.ok()) {
        dst_context = tmp_context.ValueOrDie()->context();
      }
    }

    if (static_cast<void*>(src_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_src);
      if (tmp_context.ok()) {
        src_context = tmp_context.ValueOrDie()->context();
      }
    }

    result = cuMemcpyPeerAsync(gpu_dst, dst_context, gpu_src, src_context, size,
                               stream);
  }
  if (result != CUDA_SUCCESS) {
    // Rich diagnostics: memory space, owning device, and peer-access state
    // for both pointers.
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to device: %s"
        "; GPU dst: %p on %s %s"
        "; GPU src: %p on %s %s"
        "; can access? %s; size: %u=0x%x",
        ToString(result), absl::bit_cast<void*>(gpu_dst),
        CUDAPointerToMemorySpaceString(gpu_dst),
        CUDAPointerToDeviceString(gpu_dst), absl::bit_cast<void*>(gpu_src),
        CUDAPointerToMemorySpaceString(gpu_src),
        CUDAPointerToDeviceString(gpu_src),
        CUDAPointersToCanAccessString(gpu_src, gpu_dst), size, size);

    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
  return true;
}
1246 
InitEvent(GpuContext * context,CUevent * result,EventFlags flags)1247 /* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
1248                                                CUevent* result,
1249                                                EventFlags flags) {
1250   int cuflags;
1251   switch (flags) {
1252     case EventFlags::kDefault:
1253       cuflags = CU_EVENT_DEFAULT;
1254       break;
1255     case EventFlags::kDisableTiming:
1256       cuflags = CU_EVENT_DISABLE_TIMING;
1257       break;
1258     default:
1259       LOG(FATAL) << "impossible event flags: " << int(flags);
1260   }
1261 
1262   ScopedActivateContext activated{context};
1263   CUresult res = cuEventCreate(result, cuflags);
1264 
1265   if (res == CUDA_SUCCESS) {
1266     return port::Status::OK();
1267   } else if (res == CUDA_ERROR_OUT_OF_MEMORY) {
1268     return port::Status(port::error::RESOURCE_EXHAUSTED,
1269                         "could not create CUDA event: out of device memory");
1270   } else {
1271     return port::Status(
1272         port::error::FAILED_PRECONDITION,
1273         absl::StrCat("could not create CUDA event: ", ToString(res)));
1274   }
1275 }
1276 
GetDeviceCount()1277 /* static */ int GpuDriver::GetDeviceCount() {
1278   int device_count = 0;
1279   CUresult res = cuDeviceGetCount(&device_count);
1280   if (res != CUDA_SUCCESS) {
1281     LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
1282     return 0;
1283   }
1284 
1285   if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) {
1286     device_count = 1;
1287   }
1288   return device_count;
1289 }
1290 
// Looks up the context that owns the device pointer. Returns UNAVAILABLE when
// the driver reports a null context (seen with cudaMallocAsync allocations)
// and INTERNAL for any other driver failure.
// NOTE(review): the out-param is a GpuContext* but CU_POINTER_ATTRIBUTE_CONTEXT
// writes a CUcontext into it — presumably the two are layout-compatible here;
// confirm against GpuContext's definition.
/* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
    CUdeviceptr pointer) {
  GpuContext* context = nullptr;
  CUresult result =
      cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
  if (result == CUDA_SUCCESS) {
    // For cudaMallocAsync, the context returned is null.  For now
    // return not-available. But how to manage that correctly
    // everywhere in TF?  Currently this is only used during error
    // handling.  So all is working fine, but TF have a different
    // error then the original one.
    if (context == nullptr) {
      return port::Status(
          port::error::UNAVAILABLE,
          absl::StrCat("failed to query context for device pointer: ",
                       ToString(result)));
    }
    return context;
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrCat("failed to query context for device pointer: ",
                   ToString(result)));
}
1316 
GetPointerMemorySpace(CUdeviceptr pointer)1317 /* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
1318     CUdeviceptr pointer) {
1319   unsigned int value;
1320   CUresult result =
1321       cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
1322   if (result == CUDA_SUCCESS) {
1323     switch (value) {
1324       case CU_MEMORYTYPE_DEVICE:
1325         return MemorySpace::kDevice;
1326       case CU_MEMORYTYPE_HOST:
1327         return MemorySpace::kHost;
1328       default:
1329         return port::Status(
1330             port::error::INTERNAL,
1331             absl::StrCat("unknown memory space provided by CUDA API: ", value));
1332     }
1333   }
1334 
1335   return port::Status(
1336       port::error::INTERNAL,
1337       absl::StrCat("failed to query device pointer for memory space: ",
1338                    ToString(result)));
1339 }
1340 
GetPointerAddressRange(CUdeviceptr dptr,CUdeviceptr * base,size_t * size)1341 /* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
1342                                                             CUdeviceptr* base,
1343                                                             size_t* size) {
1344   CUresult result = cuMemGetAddressRange(base, size, dptr);
1345   if (result == CUDA_SUCCESS) {
1346     return port::Status::OK();
1347   } else if (result == CUDA_ERROR_NOT_FOUND) {
1348     // We differentiate between "this pointer is unknown" (return here) and
1349     // "there was an internal error while performing this operation" (return
1350     // below).
1351     return port::Status(
1352         port::error::NOT_FOUND,
1353         absl::StrFormat("not a device pointer %p; %s",
1354                         reinterpret_cast<void*>(dptr), ToString(result)));
1355   }
1356 
1357   return port::Status(
1358       port::error::INTERNAL,
1359       absl::StrFormat("failed to get pointer into for device pointer %p; %s",
1360                       reinterpret_cast<void*>(dptr), ToString(result)));
1361 }
1362 
GetPointerDevice(CUdeviceptr pointer)1363 /* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
1364     CUdeviceptr pointer) {
1365   auto result = GetPointerContext(pointer);
1366   if (!result.ok()) {
1367     return result.status();
1368   }
1369 
1370   return DeviceFromContext(result.ValueOrDie());
1371 }
1372 
GetComputeCapability(int * cc_major,int * cc_minor,CUdevice device)1373 /* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
1374                                                           int* cc_minor,
1375                                                           CUdevice device) {
1376   *cc_major = 0;
1377   *cc_minor = 0;
1378 
1379   CUresult res = cuDeviceGetAttribute(
1380       cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
1381   if (res != CUDA_SUCCESS) {
1382     return port::Status(
1383         port::error::INTERNAL,
1384         absl::StrFormat(
1385             "failed to get compute capability major for device: %s; %d",
1386             ToString(res), device));
1387   }
1388 
1389   res = cuDeviceGetAttribute(
1390       cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
1391   if (res != CUDA_SUCCESS) {
1392     return port::Status(
1393         port::error::INTERNAL,
1394         absl::StrFormat(
1395             "failed to get compute capability minor for device: %s; %d",
1396             ToString(res), device));
1397   }
1398 
1399   return port::Status::OK();
1400 }
1401 
GetGpuISAVersion(int * version,CUdevice device)1402 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
1403                                                       CUdevice device) {
1404   return port::Status{
1405       port::error::INTERNAL,
1406       "Feature not supported on CUDA platform (GetGpuISAVersion)"};
1407 }
1408 
GetGpuGCNArchName(CUdevice,std::string *)1409 /* static */ port::Status GpuDriver::GetGpuGCNArchName(CUdevice, std::string*) {
1410   return port::Status{
1411       port::error::INTERNAL,
1412       "Feature not supported on CUDA platform (GetGpuGCNArchName)"};
1413 }
1414 
1415 // Helper function that turns the integer output of cuDeviceGetAttribute to type
1416 // T and wraps it in a StatusOr.
1417 template <typename T>
GetSimpleAttribute(CUdevice device,CUdevice_attribute attribute)1418 static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
1419                                             CUdevice_attribute attribute) {
1420   int value = -1;
1421   RETURN_IF_CUDA_RES_ERROR(cuDeviceGetAttribute(&value, attribute, device),
1422                            "Could not retrieve CUDA device attribute (",
1423                            attribute);
1424   T converted = value;
1425   return converted;
1426 }
1427 
GetMultiprocessorCount(CUdevice device)1428 /* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
1429     CUdevice device) {
1430   return GetSimpleAttribute<int>(device,
1431                                  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
1432 }
1433 
GetMaxSharedMemoryPerCore(CUdevice device)1434 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
1435     CUdevice device) {
1436   return GetSimpleAttribute<int64>(
1437       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
1438 }
1439 
GetMaxSharedMemoryPerBlock(CUdevice device)1440 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
1441     CUdevice device) {
1442   return GetSimpleAttribute<int64>(
1443       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
1444 }
1445 
GetMaxThreadsPerMultiprocessor(CUdevice device)1446 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
1447     CUdevice device) {
1448   return GetSimpleAttribute<int64>(
1449       device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
1450 }
1451 
GetMaxThreadsPerBlock(CUdevice device)1452 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
1453     CUdevice device) {
1454   return GetSimpleAttribute<int64>(device,
1455                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
1456 }
1457 
GetMaxRegistersPerBlock(CUdevice device)1458 /* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
1459     CUdevice device) {
1460   return GetSimpleAttribute<int64>(device,
1461                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
1462 }
1463 
GetThreadsPerWarp(CUdevice device)1464 /* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
1465     CUdevice device) {
1466   return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
1467 }
1468 
GetGridLimits(int * x,int * y,int * z,CUdevice device)1469 /* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
1470                                            CUdevice device) {
1471   int value;
1472   CUresult res =
1473       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
1474   if (res != CUDA_SUCCESS) {
1475     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
1476     return false;
1477   }
1478   *x = value;
1479 
1480   res =
1481       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
1482   if (res != CUDA_SUCCESS) {
1483     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
1484     return false;
1485   }
1486   *y = value;
1487 
1488   res =
1489       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
1490   if (res != CUDA_SUCCESS) {
1491     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
1492     return false;
1493   }
1494   *z = value;
1495   return true;
1496 }
1497 
GetDriverVersion(int * driver_version)1498 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
1499   CUresult res = cuDriverGetVersion(driver_version);
1500   if (res != CUDA_SUCCESS) {
1501     LOG(ERROR) << "failed to query driver version: " << ToString(res);
1502     return false;
1503   }
1504 
1505   return true;
1506 }
1507 
GetDeviceProperties(CUdevprop * device_properties,int device_ordinal)1508 /* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
1509                                                  int device_ordinal) {
1510   CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
1511   if (res != CUDA_SUCCESS) {
1512     LOG(ERROR) << "failed to query device properties: " << ToString(res);
1513     return false;
1514   }
1515 
1516   return true;
1517 }
1518 
GetDeviceAttribute(CUdevice_attribute attribute,CUdevice device)1519 /* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
1520     CUdevice_attribute attribute, CUdevice device) {
1521   int val;
1522   CUresult res = cuDeviceGetAttribute(&val, attribute, device);
1523   if (res != CUDA_SUCCESS) {
1524     return port::Status(
1525         port::error::INTERNAL,
1526         absl::StrFormat("failed to get device attribute %d for device %d: %s",
1527                         attribute, device, ToString(res)));
1528   }
1529   return val;
1530 }
1531 
IsEccEnabled(CUdevice device,bool * result)1532 /* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
1533   int value = -1;
1534   CUresult res =
1535       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
1536   if (res != CUDA_SUCCESS) {
1537     LOG(ERROR) << "failed to query ECC status: " << ToString(res);
1538     return false;
1539   }
1540 
1541   *result = value;
1542   return true;
1543 }
1544 
GetDeviceMemoryInfo(GpuContext * context,int64 * free_out,int64 * total_out)1545 /* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
1546                                                  int64* free_out,
1547                                                  int64* total_out) {
1548   ScopedActivateContext activation(context);
1549   size_t free = 0;
1550   size_t total = 0;
1551   CUresult res = cuMemGetInfo(&free, &total);
1552   if (res != CUDA_SUCCESS) {
1553     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
1554     return false;
1555   }
1556 
1557   *free_out = free;
1558   *total_out = total;
1559   return true;
1560 }
1561 
GetDeviceTotalMemory(CUdevice device,uint64 * result)1562 /* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
1563                                                   uint64* result) {
1564   size_t value = -1;
1565   CUresult res = cuDeviceTotalMem(&value, device);
1566   if (res != CUDA_SUCCESS) {
1567     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
1568     return false;
1569   }
1570 
1571   *result = value;
1572   return true;
1573 }
1574 
GetPCIBusID(CUdevice device)1575 /* static */ std::string GpuDriver::GetPCIBusID(CUdevice device) {
1576   std::string pci_bus_id;
1577   static const int kBufferSize = 64;
1578   absl::InlinedVector<char, 4> chars(kBufferSize);
1579   chars[kBufferSize - 1] = '\0';
1580   CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
1581   if (res != CUDA_SUCCESS) {
1582     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
1583     return pci_bus_id;
1584   }
1585   pci_bus_id = chars.begin();
1586   return pci_bus_id;
1587 }
1588 
CanEnablePeerAccess(GpuContext * from,GpuContext * to)1589 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
1590                                                  GpuContext* to) {
1591   if (from == to) {
1592     return true;  // A context can always access its own memory.
1593   }
1594 
1595   auto from_device = DeviceFromContext(from);
1596   if (!from_device.ok()) {
1597     LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
1598                << from_device.status();
1599     return false;
1600   }
1601   auto to_device = DeviceFromContext(to);
1602   if (!to_device.ok()) {
1603     LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
1604                << to_device.status();
1605     return false;
1606   }
1607   return CanEnablePeerAccess(from_device.ValueOrDie(), to_device.ValueOrDie());
1608 }
1609 
CanEnablePeerAccess(GpuDeviceHandle from,GpuDeviceHandle to)1610 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuDeviceHandle from,
1611                                                  GpuDeviceHandle to) {
1612   int can_access_peer = -1;
1613   CUresult result = cuDeviceCanAccessPeer(&can_access_peer, from, to);
1614   if (result != CUDA_SUCCESS) {
1615     LOG(ERROR) << "failed to detect peer access capability: "
1616                << ToString(result);
1617     return false;
1618   }
1619   return can_access_peer;
1620 }
1621 
EnablePeerAccess(GpuContext * from,GpuContext * to)1622 /* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
1623                                                       GpuContext* to) {
1624   if (from == to) {
1625     return port::Status::OK();  // A context can always access its own memory.
1626   }
1627 
1628   ScopedActivateContext activated{from};
1629   CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
1630   if (result != CUDA_SUCCESS &&
1631       result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
1632     return port::Status(
1633         port::error::INTERNAL,
1634         absl::StrFormat("failed to enable peer access from %p to %p: %s", from,
1635                         to, ToString(result)));
1636   }
1637 
1638   return port::Status::OK();
1639 }
1640 
GetMaxOccupiedBlocksPerCore(GpuContext * context,CUfunction kernel,int threads_per_block,size_t dynamic_shared_memory_bytes)1641 /* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
1642     GpuContext* context, CUfunction kernel, int threads_per_block,
1643     size_t dynamic_shared_memory_bytes) {
1644   ScopedActivateContext activation(context);
1645 
1646   int max_blocks;
1647   RETURN_IF_CUDA_RES_ERROR(
1648       cuOccupancyMaxActiveBlocksPerMultiprocessor(
1649           &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes),
1650       absl::StrFormat("Failed to calculate occupancy of kernel %p", kernel));
1651   return max_blocks;
1652 }
1653 
1654 }  // namespace gpu
1655 
1656 namespace cuda {
1657 
CurrentContextOrDie()1658 CUcontext CurrentContextOrDie() {
1659   CUcontext current = nullptr;
1660   FAIL_IF_CUDA_RES_ERROR(cuCtxGetCurrent(&current),
1661                          "Failed to query current context");
1662   return current;
1663 }
1664 
1665 }  // namespace cuda
1666 }  // namespace stream_executor
1667