/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/cuda/cuda_driver.h"

#include <stdint.h>
#include <stdlib.h>

#include <map>
#include <set>
#include <utility>

#include "absl/base/casts.h"
#include "absl/base/const_init.h"
#include "absl/container/inlined_vector.h"
#include "absl/debugging/leak_check.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "absl/synchronization/notification.h"
#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/human_readable.h"
#include "tensorflow/stream_executor/lib/stacktrace.h"
#include "tensorflow/stream_executor/lib/static_threadlocal.h"
#include "tensorflow/stream_executor/lib/threadpool.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"

bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
bool FLAGS_gpuexec_cuda_device_0_only = false;

#define RETURN_IF_CUDA_RES_ERROR(expr, ...)                            \
  do {                                                                 \
    CUresult _res = (expr);                                            \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {                      \
      return port::InternalError(absl::StrCat(                         \
          __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res))); \
    }                                                                  \
  } while (0)

#define FAIL_IF_CUDA_RES_ERROR(expr, ...)                   \
  do {                                                      \
    CUresult _res = (expr);                                 \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {           \
      LOG(FATAL) << absl::StrCat(__VA_ARGS__) << ": "       \
                 << ::stream_executor::gpu::ToString(_res); \
    }                                                       \
  } while (0)
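
// Illustrative usage of the two macros above (not part of the original file):
// wrapping a raw driver call either propagates a port::InternalError to the
// caller or crashes the process with the formatted message, e.g.
//
//   RETURN_IF_CUDA_RES_ERROR(cuCtxSynchronize(), "failed to synchronize");
//   FAIL_IF_CUDA_RES_ERROR(cuCtxSynchronize(), "failed to synchronize");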

// Debugging: on each push and pop of a cuda context, verify the current
// context matches the expected one.
constexpr bool kVerifyGpuContext = false;

namespace stream_executor {
namespace gpu {

/* static */ absl::Mutex CreatedContexts::mu_{absl::kConstInit};
/* static */ int64_t CreatedContexts::next_id_ = 1;  // 0 means "no context"

namespace {

bool UseCudaMallocAsyncAllocator() {
  static const char* debug_allocator_str = std::getenv("TF_GPU_ALLOCATOR");
  return debug_allocator_str != nullptr &&
         std::strcmp(debug_allocator_str, "cuda_malloc_async") == 0;
}

// Returns the current context and checks that it is in the set of CUDA
// contexts created by StreamExecutor (to ensure that the CUDA runtime didn't
// create a context behind our backs).
CUcontext CurrentContext() {
  CUcontext current = cuda::CurrentContextOrDie();
  if (current != nullptr && !CreatedContexts::Has(current)) {
    LOG(FATAL) << "current context was not created by the StreamExecutor "
                  "cuda_driver API: "
               << current
               << "; a CUDA runtime call "
                  "was likely performed without using a StreamExecutor context";
  }
  return current;
}

// CUDA driver routines may require a large amount of stack (particularly
// cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
// stack-limited threads (such as those spawned by a default-argument
// thread::ThreadPool on some platforms), we run certain routines in this pool
// and wait for completion.
port::ThreadPool* GetDriverExecutor() {
  static port::ThreadPool* thread_pool = new port::ThreadPool(
      port::Env::Default(), port::ThreadOptions(), "cuda_driver", 1);
  return thread_pool;
}
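
// Illustrative only (not from the original file): callers dispatch
// stack-hungry driver work onto this single-threaded pool and join with an
// absl::Notification, as LoadPtx() below does:
//
//   absl::Notification done;
//   GetDriverExecutor()->Schedule([&] { /* driver call */ done.Notify(); });
//   done.WaitForNotification();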

}  // namespace

std::string MemorySpaceString(MemorySpace memory_space) {
  switch (memory_space) {
    case MemorySpace::kHost:
      return "host";
    case MemorySpace::kDevice:
      return "device";
    default:
      LOG(FATAL) << "impossible memory space";
  }
}

namespace {

// Call cuCtxSynchronize and crash if it doesn't succeed.
void SynchronizeOrDie() {
  FAIL_IF_CUDA_RES_ERROR(cuCtxSynchronize(),
                         "Synchronize fail: ", port::CurrentStackTrace());
}

struct ThreadLocalData {
  int64 id;
  GpuContext* context;  // Only valid if id == a known good context.
  int depth;
};

SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);

}  // namespace

ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  // If this is an outermost scope, we must not assume that the CUDA context
  // remains in the state we left it in. Other code may have run on this
  // thread and altered the context.
  if (tls->depth == 0) {
    VLOG(3) << "ScopedActivateContext switching to " << cuda_context->id();
    FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                           "Failed setting context");
    tls->depth = 1;
    tls->id = cuda_context->id();
    tls->context = cuda_context;
    to_restore_ = nullptr;
    return;
  }

  tls->depth++;
  if (tls->id == cuda_context->id()) {
    if (kVerifyGpuContext) {
      CHECK_EQ(CurrentContext(), cuda_context->context());
    }
    DCHECK_EQ(CurrentContext(), cuda_context->context());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching context from " << tls->id
          << " to " << cuda_context->id();

  to_restore_ = tls->context;
  // Set the context and update the thread-local data.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                         "Failed setting context");
  tls->id = cuda_context->id();
  tls->context = cuda_context;
}

ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
    // possible this could fail in the CurrentContext() call.
    CHECK_EQ(CurrentContext(),
             tls->context == nullptr ? nullptr : tls->context->context());
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);
  if (to_restore_ == nullptr) {
    // Leave the context, tls->id, and tls->context set as they are.
    return;
  }

  // Set the context and update the thread-local data.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(to_restore_->context()),
                         "Failed setting context");
  tls->id = to_restore_->id();
  tls->context = to_restore_;
}
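
// Illustrative only (not from the original file): activation scopes nest;
// only the outermost scope re-reads the thread's CUDA context, and each inner
// scope restores the enclosing scope's context on destruction:
//
//   {
//     ScopedActivateContext outer(ctx_a);    // makes ctx_a current
//     {
//       ScopedActivateContext inner(ctx_b);  // switches to ctx_b
//     }                                      // restores ctx_a
//   }                                        // leaves ctx_a current (depth 0)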

namespace {

// Returns a stringified device number associated with pointer, primarily for
// logging purposes. Returns "?" if the device could not be successfully
// queried.
std::string CUDAPointerToDeviceString(CUdeviceptr pointer) {
  auto value = GpuDriver::GetPointerDevice(pointer);
  if (value.ok()) {
    return absl::StrCat(value.ValueOrDie());
  }
  LOG(ERROR) << "could not query device: " << value.status();
  return "?";
}

// Returns a stringified memory space associated with pointer, primarily for
// logging purposes. Returns "?" if the memory space could not be successfully
// queried.
std::string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
  auto value = GpuDriver::GetPointerMemorySpace(pointer);
  if (value.ok()) {
    return MemorySpaceString(value.ValueOrDie());
  }
  LOG(ERROR) << "could not query memory space: " << value.status();
  return "?";
}

// Returns a stringified representation of whether or not peer access is
// permitted between the "from" and "to" pointers' associated contexts,
// primarily for logging purposes. Returns "error" if an error is encountered
// while querying.
std::string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
  auto from_context = GpuDriver::GetPointerContext(from);
  if (!from_context.ok()) {
    LOG(ERROR) << "could not retrieve source pointer's context: "
               << from_context.status();
    return "error";
  }
  auto to_context = GpuDriver::GetPointerContext(to);
  if (!to_context.ok()) {
    LOG(ERROR) << "could not retrieve destination pointer's context: "
               << to_context.status();
    return "error";
  }
  return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
                                        to_context.ValueOrDie())
             ? "true"
             : "false";
}

// Actually performs the work of CUDA initialization. Wrapped up in a one-time
// execution guard.
static port::Status InternalInit() {
  CUresult res = CUDA_ERROR_NO_DEVICE;
  if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
    LOG(ERROR) << "injecting CUDA init error; initialization will fail";
  } else {
    res = cuInit(0 /* = flags */);
  }

  if (res == CUDA_SUCCESS) {
    return port::Status::OK();
  } else if (res == CUDA_ERROR_SHARED_OBJECT_INIT_FAILED) {
    LOG(WARNING) << "failed call to cuInit: " << ToString(res);
  } else {
    LOG(ERROR) << "failed call to cuInit: " << ToString(res);
  }

  Diagnostician::LogDiagnosticInformation();
  return port::Status(port::error::ABORTED,
                      absl::StrCat("failed call to cuInit: ", ToString(res)));
}

}  // namespace

/* static */ port::Status GpuDriver::Init() {
  // Cached return value from calling InternalInit(), as cuInit need only be
  // called once, but GpuDriver::Init may be called many times.
  static port::Status* init_retval = [] {
    return new port::Status(InternalInit());
  }();
  return *init_retval;
}

/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
                                               CUdevice* device) {
  RETURN_IF_CUDA_RES_ERROR(cuDeviceGet(device, device_ordinal),
                           "Failed call to cuDeviceGet");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::GetDeviceName(CUdevice device,
                                                   std::string* device_name) {
  static const size_t kCharLimit = 64;
  absl::InlinedVector<char, 4> chars(kCharLimit);
  RETURN_IF_CUDA_RES_ERROR(
      cuDeviceGetName(chars.begin(), kCharLimit - 1, device),
      "Failed to get device name");
  chars[kCharLimit - 1] = '\0';
  *device_name = chars.begin();
  return port::Status::OK();
}

bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
                                 int* flags) {
  static_assert(DeviceOptions::kMask == 0xf,
                "needs update for new device options");

  if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) {
    *flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
  }

  // If no flags are set the default is CU_CTX_SCHED_AUTO, which
  // in Google environments is very likely to mean SPIN.
  if (device_options.flags() & DeviceOptions::kScheduleSpin) {
    *flags |= CU_CTX_SCHED_SPIN;
  }
  if (device_options.flags() & DeviceOptions::kScheduleYield) {
    *flags |= CU_CTX_SCHED_YIELD;
  }
  if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) {
    *flags |= CU_CTX_SCHED_BLOCKING_SYNC;
  }

  return true;
}

/* static */ port::Status GpuDriver::CreateContext(
    int device_ordinal, CUdevice device, const DeviceOptions& device_options,
    GpuContext** context) {
  *context = nullptr;

  int flags = 0;
  if (!DeviceOptionsToContextFlags(device_options, &flags)) {
    LOG(WARNING) << "could not convert all device options into context flags";
  }

  CUresult res;
  CUcontext former_context;
  CUcontext new_context;

  unsigned int former_primary_context_flags;
  int former_primary_context_is_active;
  CHECK_EQ(CUDA_SUCCESS,
           cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
                                      &former_primary_context_is_active));
  if (former_primary_context_flags != flags) {
    if (former_primary_context_is_active) {
      LOG(ERROR)
          << "The primary context is active and has a different flag set ("
          << former_primary_context_flags << ") than the desired flag set ("
          << flags << ").";
    } else {
      CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
    }
  }

  former_context = cuda::CurrentContextOrDie();
  res = cuDevicePrimaryCtxRetain(&new_context, device);
  if (former_context != nullptr) {
    CUdevice former_device;
    if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
      if (former_device == device) {
        if (former_context == new_context) {
          VLOG(2) << "The primary context " << former_context << " for device "
                  << device
                  << " exists before initializing the StreamExecutor.";
        } else {
          LOG(WARNING) << "A non-primary context " << former_context
                       << " for device " << device
                       << " exists before initializing the StreamExecutor. The "
                       << "primary context is now " << new_context << ". We "
                       << "haven't verified StreamExecutor works with that.";
        }
      }
    } else {
      LOG(ERROR) << "Failed to get the device of the current context "
                 << former_context;
    }
  }
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));

  if (res == CUDA_SUCCESS) {
    *context = CreatedContexts::Add(new_context, device_ordinal);
    CHECK(*context != nullptr)
        << "success in this call must entail non-null result";
    VLOG(2) << "created or reused context " << new_context
            << " for this thread";
    return port::Status::OK();
  }

  std::string message =
      "failed call to cuDevicePrimaryCtxRetain: " + ToString(res);
  if (res == CUDA_ERROR_OUT_OF_MEMORY) {
    uint64 total_memory;
    if (GetDeviceTotalMemory(device, &total_memory)) {
      absl::StrAppend(&message, "; total memory reported: ", total_memory);
    } else {
      absl::StrAppend(&message, "; could not query total memory");
    }
  }

  return port::Status(port::error::INTERNAL, message);
}

/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
  if (context == nullptr) {
    return;
  }
  CUcontext former_context = CurrentContext();
  CUresult res = cuCtxSetCurrent(context->context());
  CUdevice device;
  cuCtxGetDevice(&device);
  cuCtxSetCurrent(former_context);

  res = cuDevicePrimaryCtxRelease(device);

  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
  }

  CreatedContexts::Remove(context->context());
}

/* static */ CUcontext GpuDriver::GetContextHandle(GpuContext* context) {
  return context->context();
}

/* static */ port::Status GpuDriver::FuncGetAttribute(
    CUfunction_attribute attribute, CUfunction func, int* attribute_value) {
  RETURN_IF_CUDA_RES_ERROR(cuFuncGetAttribute(attribute_value, attribute, func),
                           "Failed to query kernel attribute: ", attribute);
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::FuncSetCacheConfig(
    CUfunction function, CUfunc_cache cache_config) {
  RETURN_IF_CUDA_RES_ERROR(cuFuncSetCacheConfig(function, cache_config),
                           "Failed to set CUDA kernel cache config");
  return port::Status::OK();
}

/* static */ port::StatusOr<CUsharedconfig>
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  CUsharedconfig shared_mem_config;
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuCtxGetSharedMemConfig(&shared_mem_config),
                           "Failed to get shared memory config");
  return shared_mem_config;
}

/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
    GpuContext* context, CUsharedconfig shared_mem_config) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuCtxSetSharedMemConfig(shared_mem_config),
                           "Failed to set shared memory config");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::LaunchKernel(
    GpuContext* context, absl::string_view kernel_name, CUfunction function,
    unsigned int grid_dim_x, unsigned int grid_dim_y, unsigned int grid_dim_z,
    unsigned int block_dim_x, unsigned int block_dim_y,
    unsigned int block_dim_z, unsigned int shared_mem_bytes, CUstream stream,
    void** kernel_params, void** extra) {
  ScopedActivateContext activation(context);
  VLOG(2) << "launching kernel: " << kernel_name << "; gdx: " << grid_dim_x
          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
          << " bdz: " << block_dim_z;
  RETURN_IF_CUDA_RES_ERROR(
      cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x,
                     block_dim_y, block_dim_z, shared_mem_bytes, stream,
                     kernel_params, extra),
      "Failed to launch CUDA kernel: ", kernel_name,
      " with block dimensions: ", block_dim_x, "x", block_dim_y, "x",
      block_dim_z, " and grid dimensions: ", grid_dim_x, "x", grid_dim_y, "x",
      grid_dim_z);
  return port::Status::OK();
}
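
// Illustrative only (names are hypothetical, not from the original file):
// callers typically derive grid dimensions from the element count and a fixed
// block size before launching:
//
//   unsigned int block_x = 256;
//   unsigned int grid_x = (element_count + block_x - 1) / block_x;
//   port::Status launch_status = GpuDriver::LaunchKernel(
//       context, "my_kernel", function, grid_x, 1, 1, block_x, 1, 1,
//       /*shared_mem_bytes=*/0, stream, kernel_params, /*extra=*/nullptr);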

/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
                                               const char* cubin_bytes,
                                               CUmodule* module) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuModuleLoadFatBinary(module, cubin_bytes),
                           "Failed to load in-memory CUBIN");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
                                             const char* ptx_contents,
                                             CUmodule* module) {
  absl::Notification notification;
  port::Status ret = port::Status::OK();
  GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
                                 &notification]() {
    ScopedActivateContext activation(context);
    void* ptx_data = const_cast<char*>(ptx_contents);
    static const unsigned int kLogBufferBytesLimit = 1024;
    unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
    unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
    absl::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes);
    absl::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes);
    bool log_verbose = true;
    CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
                              CU_JIT_ERROR_LOG_BUFFER,
                              CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
                              CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE};
    // Note that the driver API wants the contents of these values to be stored
    // in an array of void*s, so we coerce them accordingly.
    void* option_values[] = {
        absl::bit_cast<void*>(uintptr_t(error_log_buffer_bytes)),
        absl::bit_cast<void*>(error_log_buffer.data()),
        absl::bit_cast<void*>(uintptr_t(info_log_buffer_bytes)),
        absl::bit_cast<void*>(info_log_buffer.data()),
        absl::bit_cast<void*>(uintptr_t(log_verbose))};
    CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));

    CUresult res;
    {
      // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
      // module loading: see http://b/13248943
      absl::LeakCheckDisabler disabler;
      res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options), options,
                               option_values);
    }

    // The PTX JIT mutates the values in the option values array to reflect
    // the sizes of the logs it outputs; now that we've made the call, read
    // the values back out.
    error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
    info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
    CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
    CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);

    if (res != CUDA_SUCCESS) {
      LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
      // As a precaution for null termination of the API-provided value, ensure
      // that at least the last byte is null.
      error_log_buffer[error_log_buffer_bytes ? error_log_buffer_bytes - 1
                                              : 0] = '\0';
      LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
                 << " bytes): " << error_log_buffer.data();
      ret = port::InternalError(
          absl::StrCat("Failed to load PTX text as a module: ", ToString(res)));
      notification.Notify();
      // Return early: absl::Notification::Notify() may be called at most once,
      // and the success path below notifies again.
      return;
    }

    VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
            << " bytes): " << info_log_buffer.data();
    VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
            << " bytes): " << error_log_buffer.data();
    CHECK(module != nullptr);
    notification.Notify();
  });
  notification.WaitForNotification();

  return ret;
}

/* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
                                               const char* hsaco_contents,
                                               CUmodule* module) {
  return port::InternalError(
      "Feature not supported on CUDA platform (LoadHsaco)");
}

/* static */ port::Status GpuDriver::SynchronousMemsetUint8(
    GpuContext* context, CUdeviceptr location, uint8 value, size_t size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuMemsetD8(location, value, size),
                           "Failed to memset memory");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::SynchronousMemsetUint32(
    GpuContext* context, CUdeviceptr location, uint32 value,
    size_t uint32_count) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuMemsetD32(location, value, uint32_count),
                           "Failed to memset memory");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
    GpuContext* context, CUdeviceptr location, uint8 value, size_t uint32_count,
    CUstream stream) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemsetD8Async(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
    GpuContext* context, CUdeviceptr location, uint32 value,
    size_t uint32_count, CUstream stream) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemsetD32Async(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return port::Status::OK();
}

/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
                                               CUstream stream,
                                               StreamCallback callback,
                                               void* data) {
  // Note: the flags param is required to be zero according to CUDA 6.0.
  CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "unable to add host callback: " << ToString(res);
    return false;
  }
  return true;
}

/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
                                               CUmodule module,
                                               const char* kernel_name,
                                               CUfunction* function) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && kernel_name != nullptr);
  CUresult res = cuModuleGetFunction(function, module, kernel_name);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
               << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
                                             CUmodule module,
                                             const char* symbol_name,
                                             CUdeviceptr* dptr, size_t* bytes) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && symbol_name != nullptr &&
        (dptr != nullptr || bytes != nullptr));
  CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
  if (res != CUDA_SUCCESS) {
    // The symbol may not be found in the current module, but it may reside in
    // another module.
    VLOG(2) << "failed to get symbol \"" << symbol_name
            << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ void GpuDriver::UnloadModule(GpuContext* context,
                                          CUmodule module) {
  ScopedActivateContext activated{context};
  CUresult res = cuModuleUnload(module);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to unload module " << module
               << "; leaking: " << ToString(res);
  }
}

/* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
    GpuContext* context) {
  ScopedActivateContext activated{context};
  CUdevice device = -1;
  CUresult result = cuCtxGetDevice(&device);
  if (result == CUDA_SUCCESS) {
    return device;
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrCat("failed to get device for context: ", ToString(result)));
}

/* static */ bool GpuDriver::CreateStream(GpuContext* context, CUstream* stream,
                                          int priority) {
  // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
  // up synchronization with respect to memsets and any other things that have
  // to occur on the default stream?
  ScopedActivateContext activated{context};
  CUresult res;
  // If the priority is 0, use the older API to create the stream with the
  // default priority, for backward compatibility. There is probably no
  // difference in using the newer API call, but we leave it as is for now.
  if (priority == 0) {
    res = cuStreamCreate(stream, 0);
  } else {
    res = cuStreamCreateWithPriority(stream, 0, priority);
  }
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not allocate CUDA stream for context "
               << context->context() << ": " << ToString(res);
    return false;
  }

  VLOG(2) << "successfully created stream " << *stream << " for context "
          << context->context() << " on thread";
  return true;
}

/* static */ void GpuDriver::DestroyStream(GpuContext* context,
                                           CUstream* stream) {
  if (*stream == nullptr) {
    return;
  }

  ScopedActivateContext activated{context};
  CUresult res = cuStreamDestroy(*stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to destroy CUDA stream for context "
               << context->context() << ": " << ToString(res);
  } else {
    VLOG(2) << "successfully destroyed stream " << *stream << " for context "
            << context->context();
    *stream = nullptr;
  }
}

/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
                                             uint64 bytes) {
  if (bytes == 0) {
    return nullptr;
  }

  ScopedActivateContext activated{context};
  CUdeviceptr result = 0;
  CUresult res = cuMemAlloc(&result, bytes);
  if (res != CUDA_SUCCESS) {
    // LOG(INFO) because this isn't always important to users (e.g.
    // BFCAllocator implements a retry if the first allocation fails).
    LOG(INFO) << "failed to allocate "
              << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
              << " bytes) from device: " << ToString(res);
    return nullptr;
  }
  void* ptr = reinterpret_cast<void*>(result);
  VLOG(2) << "allocated " << ptr << " for context " << context->context()
          << " of " << bytes << " bytes";
  return ptr;
}

/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
                                              void* location) {
  ScopedActivateContext activation(context);
  CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
  CUresult res = cuMemFree(pointer);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to free device memory at " << location
               << "; result: " << ToString(res);
  } else {
    VLOG(2) << "deallocated " << location << " for context "
            << context->context();
  }
}

/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
                                                    uint64 bytes) {
  ScopedActivateContext activation(context);
  CUdeviceptr result = 0;
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to alloc " << bytes
               << " bytes unified memory; result: " << ToString(res);
    return nullptr;
  }
  void* ptr = reinterpret_cast<void*>(result);
  VLOG(2) << "allocated " << ptr << " for context " << context->context()
          << " of " << bytes << " bytes in unified memory";
  return ptr;
}

/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
                                                     void* location) {
  ScopedActivateContext activation(context);
  CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
  CUresult res = cuMemFree(pointer);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to free unified memory at " << location
               << "; result: " << ToString(res);
  } else {
    VLOG(2) << "deallocated unified memory at " << location << " for context "
            << context->context();
  }
}

/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
  ScopedActivateContext activation(context);
  void* host_mem = nullptr;
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to alloc " << bytes
               << " bytes on host: " << ToString(res);
  }
  return host_mem;
}

/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
                                            void* location) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemFreeHost(location);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error deallocating host memory at " << location << ": "
               << ToString(res);
  }
}

/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
                                          uint64 bytes) {
  ScopedActivateContext activation(context);
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res =
      cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error registering host memory at " << location << ": "
               << ToString(res);
    return false;
  }
  return true;
}

/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
                                            void* location) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemHostUnregister(location);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error unregistering host memory at " << location << ": "
               << ToString(res);
    return false;
  }
  return true;
}
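
// Illustrative only (not from the original file): registering ("pinning") a
// host buffer makes it eligible for async DMA transfers; a typical pairing is:
//
//   if (GpuDriver::HostRegister(context, buf, len)) {
//     GpuDriver::AsynchronousMemcpyH2D(context, gpu_dst, buf, len, stream);
//     ...  // synchronize the stream before touching or unregistering buf
//     GpuDriver::HostUnregister(context, buf);
//   }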

#if CUDA_VERSION >= 10020
/* static */ port::StatusOr<GpuDriver::VmemSpan>
GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64 bytes) {
  ScopedActivateContext activation(context);
  CUdeviceptr base;
  CUresult res = cuMemAddressReserve(&base, bytes, /*alignment=*/0,
                                     /*addr=*/0, /*flags=*/0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(
        absl::StrFormat("error reserving %d bytes of virtual GPU memory: %s",
                        bytes, ToString(res)));
  }
  return {{base, bytes}};
}

/* static */ void GpuDriver::FreeVirtualMemory(
    GpuContext* context, GpuDriver::VmemSpan reservation) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemAddressFree(reservation.base, reservation.size_bytes);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error freeing vmem reservation of size "
               << reservation.size_bytes << " at address " << reservation.base
               << ": " << ToString(res);
  }
}

/* static */ port::StatusOr<uint64> GpuDriver::GetMinAllocationGranularity(
    GpuDeviceHandle device) {
  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device;

  size_t granularity;
  CUresult res = cuMemGetAllocationGranularity(
      &granularity, &props, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(absl::StrCat(
        "failed to get min allocation granularity: ", ToString(res)));
  }
  return granularity;
}
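
// Illustrative only (not from the original file): cuMemCreate and
// cuMemAddressReserve require sizes aligned to this granularity, so callers
// typically round the requested byte count up first:
//
//   uint64 padded = (bytes + granularity - 1) / granularity * granularity;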

/* static */ port::StatusOr<GpuDriver::GenericMemoryHandle>
GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
  ScopedActivateContext activation(context);
  auto device = DeviceFromContext(context);
  if (!device.ok()) {
    LOG(ERROR) << "Failed to get device from context: " << device.status();
    return device.status();
  }

  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device.ValueOrDie();

  CUmemGenericAllocationHandle mem_handle;
  CUresult res = cuMemCreate(&mem_handle, bytes, &props, 0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(
        absl::StrFormat("failed to create memory allocation of size %d: %s",
                        bytes, ToString(res)));
  }
  return GpuDriver::GenericMemoryHandle{mem_handle, bytes};
}

/* static */ void GpuDriver::ReleaseMemoryHandle(
    GpuContext* context, GpuDriver::GenericMemoryHandle handle) {
  ScopedActivateContext activation(context);

  CUresult res = cuMemRelease(handle.handle);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "Failed to release memory handle " << handle.handle
               << " of size " << handle.bytes << ": " << ToString(res);
  }
}

/* static */ port::Status GpuDriver::MapMemory(
    GpuContext* context, CUdeviceptr va,
    const GpuDriver::GenericMemoryHandle& handle,
    const std::vector<GpuDeviceHandle>& device_handles) {
  ScopedActivateContext activation(context);

  auto device = DeviceFromContext(context);
  if (!device.ok()) {
    return device.status();
  }

  // NB: Zero is the only valid value for both flags and offset.
  CUresult res =
      cuMemMap(va, handle.bytes, /*offset=*/0, handle.handle, /*flags=*/0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(absl::StrFormat(
        "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
  }

  std::vector<CUmemAccessDesc> access_descriptors(device_handles.size());
  for (int i = 0; i < access_descriptors.size(); ++i) {
    access_descriptors[i].location.id = device_handles[i];
    access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  }

  res = cuMemSetAccess(va, handle.bytes, access_descriptors.data(),
                       access_descriptors.size());
  if (res != CUDA_SUCCESS) {
    // Unmap the memory that we failed to set access for.
    if (cuMemUnmap(va, handle.bytes) != CUDA_SUCCESS) {
      LOG(ERROR)
          << "Failed to unmap memory in GpuDriver::MapMemory error path.";
    }
    return port::InternalError(absl::StrFormat(
        "Failed to set read/write access on memory mapped at %d: %s", va,
        ToString(res)));
  }
  return port::Status::OK();
}

/* static */ void GpuDriver::UnmapMemory(GpuContext* context, CUdeviceptr va,
                                         uint64 bytes) {
  ScopedActivateContext activation(context);

  CUresult res = cuMemUnmap(va, bytes);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "Failed to unmap memory at " << va << " of size " << bytes
               << ": " << ToString(res);
  }
}
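
// Illustrative only (not from the original file): the CUDA virtual memory
// management entry points above compose in this order, mirrored on teardown:
//
//   ReserveVirtualMemory -> CreateMemoryHandle -> MapMemory -> ... use ...
//   -> UnmapMemory -> ReleaseMemoryHandle -> FreeVirtualMemory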

#endif

/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                  CUevent* event) {
  if (*event == nullptr) {
    return port::Status(port::error::INVALID_ARGUMENT,
                        "input event cannot be null");
  }

  ScopedActivateContext activated{context};
  RETURN_IF_CUDA_RES_ERROR(cuEventDestroy(*event),
                           "Error destroying CUDA event");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
                                                 CUevent event,
                                                 CUstream stream) {
  ScopedActivateContext activated{context};
  RETURN_IF_CUDA_RES_ERROR(cuEventRecord(event, stream),
                           "Error recording CUDA event");
  return port::Status::OK();
}

/* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
                                                            CUevent event) {
  ScopedActivateContext activated{context};
  CUresult res = cuEventQuery(event);
  if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("failed to query event: %s", ToString(res)));
  }

  return res;
}

/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
                                                 float* elapsed_milliseconds,
                                                 CUevent start, CUevent stop) {
  ScopedActivateContext activated{context};
  // The stop event must have completed in order for cuEventElapsedTime to
  // work.
  CUresult res = cuEventSynchronize(stop);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
    return false;
  }
  res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to get elapsed time between events: "
               << ToString(res);
    return false;
  }

  return true;
}
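
// Illustrative only (not from the original file): elapsed-time queries pair
// two events recorded on the same stream around the work being timed:
//
//   GpuDriver::RecordEvent(context, start, stream);
//   ...  // enqueue the work to be timed
//   GpuDriver::RecordEvent(context, stop, stream);
//   float ms = 0.0f;
//   GpuDriver::GetEventElapsedTime(context, &ms, start, stop);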

/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
                                               CUstream stream, CUevent event) {
  ScopedActivateContext activation(context);
  CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not wait stream on event: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
  ScopedActivateContext activation(context);
  CUresult res = cuCtxSynchronize();
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
               << " :: " << port::CurrentStackTrace();
    return false;
  }

  return true;
}

/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
                                                       CUstream stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  RETURN_IF_CUDA_RES_ERROR(cuStreamSynchronize(stream),
                           "Could not synchronize CUDA stream");
  return port::Status::OK();
}

/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
                                          CUstream stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  CUresult res = cuStreamQuery(stream);
  if (res == CUDA_SUCCESS) {
    return true;
  }

  if (res != CUDA_ERROR_NOT_READY) {
    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
  }
  return false;
}

/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
                                                          void* host_dst,
                                                          CUdeviceptr gpu_src,
                                                          uint64 size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemcpyDtoH(host_dst, gpu_src, size),
      absl::StrFormat("failed to synchronously memcpy from device to host: "
                      "host dst: %p; GPU src: %p; size: %u=0x%x",
                      host_dst, absl::bit_cast<void*>(gpu_src), size, size));
  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
          << host_dst;
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
                                                          CUdeviceptr gpu_dst,
                                                          const void* host_src,
                                                          uint64 size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemcpyHtoD(gpu_dst, host_src, size),
      absl::StrFormat(
          "failed to synchronously memcpy from host to device: GPU dst: %p;"
          " host src: %p; size: %u=0x%x",
          absl::bit_cast<void*>(gpu_dst), host_src, size, size));
  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
                                                          CUdeviceptr gpu_dst,
                                                          CUdeviceptr gpu_src,
                                                          uint64 size) {
  ScopedActivateContext activation(context);

  CUresult result;
  // CreatedContexts::GetAnyContext() doesn't work when ptr == 0.
  // This happens when the size is 0.
  if (gpu_dst == 0 || gpu_src == 0 || !UseCudaMallocAsyncAllocator()) {
    result = cuMemcpyDtoD(gpu_dst, gpu_src, size);
  } else {
    // Any context will work here.
    CUcontext dst_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_dst));
    CUcontext src_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_src));

    if (static_cast<void*>(dst_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_dst);
      if (tmp_context.ok()) {
        dst_context = tmp_context.ValueOrDie()->context();
      }
    }

    if (static_cast<void*>(src_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_src);
      if (tmp_context.ok()) {
        src_context = tmp_context.ValueOrDie()->context();
      }
    }

    result = cuMemcpyPeer(gpu_dst, dst_context, gpu_src, src_context, size);
  }

  RETURN_IF_CUDA_RES_ERROR(
      result,
      absl::StrFormat(
          "failed to synchronously memcpy from device to device: GPU dst: %p; "
          "GPU src: %p; size: %u=0x%x",
          absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
          size));
  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
  return port::Status::OK();
}

/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
                                                   void* host_dst,
                                                   CUdeviceptr gpu_src,
                                                   uint64 size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
        "GPU src: %p; size: %u=0x%x",
        ToString(res), host_dst, absl::bit_cast<void*>(gpu_src), size, size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
          << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
          << host_dst << " on stream " << stream;
  return true;
}

/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
                                                   CUdeviceptr gpu_dst,
                                                   const void* host_src,
                                                   uint64 size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
        "host src: %p; size: %u=0x%x",
        ToString(res), absl::bit_cast<void*>(gpu_dst), host_src, size, size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
          << " on stream " << stream;
  return true;
}

/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
                                                   CUdeviceptr gpu_dst,
                                                   CUdeviceptr gpu_src,
                                                   uint64 size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult result;
  // CreatedContexts::GetAnyContext() doesn't work when ptr == 0.
  // This happens when the size is 0.
  if (gpu_dst == 0 || gpu_src == 0 || !UseCudaMallocAsyncAllocator()) {
    result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
  } else {
    // Any context will work here.
    CUcontext dst_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_dst));
    CUcontext src_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_src));

    if (static_cast<void*>(dst_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_dst);
      if (tmp_context.ok()) {
        dst_context = tmp_context.ValueOrDie()->context();
      }
    }

    if (static_cast<void*>(src_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_src);
      if (tmp_context.ok()) {
        src_context = tmp_context.ValueOrDie()->context();
      }
    }

    result = cuMemcpyPeerAsync(gpu_dst, dst_context, gpu_src, src_context, size,
                               stream);
  }
  if (result != CUDA_SUCCESS) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to device: %s"
        "; GPU dst: %p on %s %s"
        "; GPU src: %p on %s %s"
        "; can access? %s; size: %u=0x%x",
        ToString(result), absl::bit_cast<void*>(gpu_dst),
        CUDAPointerToMemorySpaceString(gpu_dst),
        CUDAPointerToDeviceString(gpu_dst), absl::bit_cast<void*>(gpu_src),
        CUDAPointerToMemorySpaceString(gpu_src),
        CUDAPointerToDeviceString(gpu_src),
        CUDAPointersToCanAccessString(gpu_src, gpu_dst), size, size);

    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
  return true;
}

/* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
                                               CUevent* result,
                                               EventFlags flags) {
  int cuflags;
  switch (flags) {
    case EventFlags::kDefault:
      cuflags = CU_EVENT_DEFAULT;
      break;
    case EventFlags::kDisableTiming:
      cuflags = CU_EVENT_DISABLE_TIMING;
      break;
    default:
      LOG(FATAL) << "impossible event flags: " << int(flags);
  }

  ScopedActivateContext activated{context};
  CUresult res = cuEventCreate(result, cuflags);

  if (res == CUDA_SUCCESS) {
    return port::Status::OK();
  } else if (res == CUDA_ERROR_OUT_OF_MEMORY) {
    return port::Status(port::error::RESOURCE_EXHAUSTED,
                        "could not create CUDA event: out of device memory");
  } else {
    return port::Status(
        port::error::FAILED_PRECONDITION,
        absl::StrCat("could not create CUDA event: ", ToString(res)));
  }
}

/* static */ int GpuDriver::GetDeviceCount() {
  int device_count = 0;
  CUresult res = cuDeviceGetCount(&device_count);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
    return 0;
  }

  if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) {
    device_count = 1;
  }
  return device_count;
}

/* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
    CUdeviceptr pointer) {
  GpuContext* context = nullptr;
  CUresult result =
      cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
  if (result == CUDA_SUCCESS) {
    // For cudaMallocAsync, the context returned is null. For now, return
    // UNAVAILABLE. It is not clear how to handle this correctly everywhere in
    // TF, but since this path is currently only exercised during error
    // handling, everything keeps working; TF merely reports a different error
    // than the original one.
    if (context == nullptr) {
      return port::Status(
          port::error::UNAVAILABLE,
          absl::StrCat("failed to query context for device pointer: ",
                       ToString(result)));
    }
    return context;
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrCat("failed to query context for device pointer: ",
                   ToString(result)));
}

/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
    CUdeviceptr pointer) {
  unsigned int value;
  CUresult result =
      cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
  if (result == CUDA_SUCCESS) {
    switch (value) {
      case CU_MEMORYTYPE_DEVICE:
        return MemorySpace::kDevice;
      case CU_MEMORYTYPE_HOST:
        return MemorySpace::kHost;
      default:
        return port::Status(
            port::error::INTERNAL,
            absl::StrCat("unknown memory space provided by CUDA API: ", value));
    }
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrCat("failed to query device pointer for memory space: ",
                   ToString(result)));
}

/* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
                                                            CUdeviceptr* base,
                                                            size_t* size) {
  CUresult result = cuMemGetAddressRange(base, size, dptr);
  if (result == CUDA_SUCCESS) {
    return port::Status::OK();
  } else if (result == CUDA_ERROR_NOT_FOUND) {
    // We differentiate between "this pointer is unknown" (return here) and
    // "there was an internal error while performing this operation" (return
    // below).
    return port::Status(
        port::error::NOT_FOUND,
        absl::StrFormat("not a device pointer %p; %s",
                        reinterpret_cast<void*>(dptr), ToString(result)));
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrFormat("failed to get pointer info for device pointer %p; %s",
                      reinterpret_cast<void*>(dptr), ToString(result)));
}

/* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
    CUdeviceptr pointer) {
  auto result = GetPointerContext(pointer);
  if (!result.ok()) {
    return result.status();
  }

  return DeviceFromContext(result.ValueOrDie());
}

/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
                                                          int* cc_minor,
                                                          CUdevice device) {
  *cc_major = 0;
  *cc_minor = 0;

  CUresult res = cuDeviceGetAttribute(
      cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  if (res != CUDA_SUCCESS) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat(
            "failed to get compute capability major for device: %s; %d",
            ToString(res), device));
  }

  res = cuDeviceGetAttribute(
      cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
  if (res != CUDA_SUCCESS) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat(
            "failed to get compute capability minor for device: %s; %d",
            ToString(res), device));
  }

  return port::Status::OK();
}

/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
                                                      CUdevice device) {
  return port::Status{
      port::error::INTERNAL,
      "Feature not supported on CUDA platform (GetGpuISAVersion)"};
}

/* static */ port::Status GpuDriver::GetGpuGCNArchName(CUdevice, std::string*) {
  return port::Status{
      port::error::INTERNAL,
      "Feature not supported on CUDA platform (GetGpuGCNArchName)"};
}

// Helper function that converts the integer output of cuDeviceGetAttribute to
// type T and wraps it in a StatusOr.
template <typename T>
static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
                                            CUdevice_attribute attribute) {
  int value = -1;
  RETURN_IF_CUDA_RES_ERROR(cuDeviceGetAttribute(&value, attribute, device),
                           "Could not retrieve CUDA device attribute (",
                           attribute, ")");
  T converted = value;
  return converted;
}

/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
    CUdevice device) {
  return GetSimpleAttribute<int>(device,
                                 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
    CUdevice device) {
  return GetSimpleAttribute<int64>(
      device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
    CUdevice device) {
  return GetSimpleAttribute<int64>(
      device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
    CUdevice device) {
  return GetSimpleAttribute<int64>(
      device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
    CUdevice device) {
  return GetSimpleAttribute<int64>(device,
                                   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
    CUdevice device) {
  return GetSimpleAttribute<int64>(device,
                                   CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
}

/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
    CUdevice device) {
  return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
}

/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
                                           CUdevice device) {
  int value;
  CUresult res =
      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
    return false;
  }
  *x = value;

  res =
      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
    return false;
  }
  *y = value;

  res =
      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
    return false;
  }
  *z = value;
  return true;
}
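
// Illustrative sketch (hypothetical helper): callers typically validate a
// proposed grid against these limits before launching a kernel.
static bool GridFitsDevice(int grid_x, int grid_y, int grid_z,
                           CUdevice device) {
  int max_x;
  int max_y;
  int max_z;
  if (!GpuDriver::GetGridLimits(&max_x, &max_y, &max_z, device)) {
    return false;  // Conservatively reject if the limits cannot be queried.
  }
  return grid_x <= max_x && grid_y <= max_y && grid_z <= max_z;
}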

/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
  CUresult res = cuDriverGetVersion(driver_version);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query driver version: " << ToString(res);
    return false;
  }

  return true;
}

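// Note: cuDeviceGetProperties is deprecated in the CUDA driver API; new code
// should query individual attributes via cuDeviceGetAttribute instead.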
/* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
                                                 int device_ordinal) {
  CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query device properties: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
    CUdevice_attribute attribute, CUdevice device) {
  int val;
  CUresult res = cuDeviceGetAttribute(&val, attribute, device);
  if (res != CUDA_SUCCESS) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("failed to get device attribute %d for device %d: %s",
                        attribute, device, ToString(res)));
  }
  return val;
}

/* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
  int value = -1;
  CUresult res =
      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
    return false;
  }

  *result = value;
  return true;
}

/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
                                                 int64* free_out,
                                                 int64* total_out) {
  ScopedActivateContext activation(context);
  size_t free = 0;
  size_t total = 0;
  CUresult res = cuMemGetInfo(&free, &total);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query device memory info: " << ToString(res);
    return false;
  }

  *free_out = free;
  *total_out = total;
  return true;
}
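
// Illustrative usage sketch (hypothetical call site, not part of GpuDriver):
// log the memory budget of a context, e.g. before sizing an allocator.
static void LogMemoryBudget(GpuContext* context) {
  int64 free_bytes = 0;
  int64 total_bytes = 0;
  if (GpuDriver::GetDeviceMemoryInfo(context, &free_bytes, &total_bytes)) {
    LOG(INFO) << "GPU memory: " << free_bytes << " bytes free of "
              << total_bytes << " bytes total";
  }
}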

/* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
                                                  uint64* result) {
  size_t value = -1;
  CUresult res = cuDeviceTotalMem(&value, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query total available memory: " << ToString(res);
    return false;
  }

  *result = value;
  return true;
}

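// Returns the bus id in the "[domain]:[bus]:[device].[function]" form
// produced by cuDeviceGetPCIBusId (e.g. "0000:04:00.0"), or an empty string
// on failure.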
/* static */ std::string GpuDriver::GetPCIBusID(CUdevice device) {
  std::string pci_bus_id;
  static const int kBufferSize = 64;
  absl::InlinedVector<char, 4> chars(kBufferSize);
  chars[kBufferSize - 1] = '\0';
  CUresult res = cuDeviceGetPCIBusId(chars.data(), kBufferSize - 1, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
    return pci_bus_id;
  }
  pci_bus_id = chars.data();
  return pci_bus_id;
}

/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
                                                 GpuContext* to) {
  if (from == to) {
    return true;  // A context can always access its own memory.
  }

  auto from_device = DeviceFromContext(from);
  if (!from_device.ok()) {
    LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
               << from_device.status();
    return false;
  }
  auto to_device = DeviceFromContext(to);
  if (!to_device.ok()) {
    LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
               << to_device.status();
    return false;
  }
  return CanEnablePeerAccess(from_device.ValueOrDie(), to_device.ValueOrDie());
}

/* static */ bool GpuDriver::CanEnablePeerAccess(GpuDeviceHandle from,
                                                 GpuDeviceHandle to) {
  int can_access_peer = -1;
  CUresult result = cuDeviceCanAccessPeer(&can_access_peer, from, to);
  if (result != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to detect peer access capability: "
               << ToString(result);
    return false;
  }
  return can_access_peer != 0;
}

/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
                                                      GpuContext* to) {
  if (from == to) {
    return port::Status::OK();  // A context can always access its own memory.
  }

  ScopedActivateContext activated{from};
  CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
  if (result != CUDA_SUCCESS &&
      result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("failed to enable peer access from %p to %p: %s", from,
                        to, ToString(result)));
  }

  return port::Status::OK();
}
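
// Illustrative sketch (hypothetical helper, not part of GpuDriver): the usual
// call pattern gates EnablePeerAccess() on CanEnablePeerAccess(), since
// enabling peering between devices that do not support it fails.
static port::Status MaybeEnablePeerAccess(GpuContext* from, GpuContext* to) {
  if (!GpuDriver::CanEnablePeerAccess(from, to)) {
    return port::Status(port::error::UNIMPLEMENTED,
                        "peer access is not supported between these devices");
  }
  return GpuDriver::EnablePeerAccess(from, to);
}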

/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
    GpuContext* context, CUfunction kernel, int threads_per_block,
    size_t dynamic_shared_memory_bytes) {
  ScopedActivateContext activation(context);

  int max_blocks;
  RETURN_IF_CUDA_RES_ERROR(
      cuOccupancyMaxActiveBlocksPerMultiprocessor(
          &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes),
      absl::StrFormat("Failed to calculate occupancy of kernel %p", kernel));
  return max_blocks;
}
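
// Illustrative sketch (hypothetical helper): occupancy is commonly used to
// size a grid so that every multiprocessor can be saturated.
static port::StatusOr<int> SuggestedBlockCount(
    GpuContext* context, CUfunction kernel, int threads_per_block,
    size_t dynamic_shared_memory_bytes, CUdevice device) {
  auto blocks_per_sm = GpuDriver::GetMaxOccupiedBlocksPerCore(
      context, kernel, threads_per_block, dynamic_shared_memory_bytes);
  if (!blocks_per_sm.ok()) {
    return blocks_per_sm.status();
  }
  auto sm_count = GpuDriver::GetMultiprocessorCount(device);
  if (!sm_count.ok()) {
    return sm_count.status();
  }
  return blocks_per_sm.ValueOrDie() * sm_count.ValueOrDie();
}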

}  // namespace gpu

namespace cuda {

CUcontext CurrentContextOrDie() {
  CUcontext current = nullptr;
  FAIL_IF_CUDA_RES_ERROR(cuCtxGetCurrent(&current),
                         "Failed to query current context");
  return current;
}

}  // namespace cuda
}  // namespace stream_executor