1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
17
18 #include <stdint.h>
19 #include <stdlib.h>
20
21 #include <map>
22 #include <set>
23 #include <utility>
24
25 #include "absl/base/casts.h"
26 #include "absl/base/const_init.h"
27 #include "absl/container/inlined_vector.h"
28 #include "absl/debugging/leak_check.h"
29 #include "absl/memory/memory.h"
30 #include "absl/strings/str_cat.h"
31 #include "absl/strings/str_format.h"
32 #include "absl/synchronization/mutex.h"
33 #include "absl/synchronization/notification.h"
34 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
35 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
36 #include "tensorflow/stream_executor/lib/env.h"
37 #include "tensorflow/stream_executor/lib/error.h"
38 #include "tensorflow/stream_executor/lib/human_readable.h"
39 #include "tensorflow/stream_executor/lib/stacktrace.h"
40 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
41 #include "tensorflow/stream_executor/lib/threadpool.h"
42 #include "tensorflow/stream_executor/platform/logging.h"
43 #include "tensorflow/stream_executor/platform/port.h"
44
// Debug/testing knobs (plain globals, not real command-line flags here):
// When true, InternalInit() skips cuInit and reports an injected failure.
bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
// When true, ScopedActivateContext synchronizes the device around every
// context activation/deactivation (slow; debugging aid).
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
// Presumably restricts enumeration to device ordinal 0 — not referenced in
// this portion of the file; verify against the rest of cuda_driver.cc.
bool FLAGS_gpuexec_cuda_device_0_only = false;
48
// Evaluates `expr` (a driver call yielding CUresult); on failure, returns a
// port::InternalError from the enclosing function whose message is the StrCat
// of the remaining arguments followed by the stringified CUresult.
#define RETURN_IF_CUDA_RES_ERROR(expr, ...)                            \
  do {                                                                 \
    CUresult _res = (expr);                                            \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {                      \
      return port::InternalError(absl::StrCat(                         \
          __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res))); \
    }                                                                  \
  } while (0)
57
// Like RETURN_IF_CUDA_RES_ERROR, but aborts the process via LOG(FATAL)
// instead of returning a status. For call sites that cannot report errors.
#define FAIL_IF_CUDA_RES_ERROR(expr, ...)                 \
  do {                                                    \
    CUresult _res = (expr);                               \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {         \
      LOG(FATAL) << absl::StrCat(__VA_ARGS__) << ": "     \
                 << ::stream_executor::gpu::ToString(_res); \
    }                                                     \
  } while (0)
66
// Debugging: on each push and pop of a cuda context, verify the current context
// matches the expected one. Compile-time constant so the checks are free when
// disabled.
constexpr bool kVerifyGpuContext = false;
70
71 namespace stream_executor {
72 namespace gpu {
73 namespace {
74
75 // Manages the singleton map of contexts that we've created, mapping
76 // from the CUcontext to the GpuContext* that we pass around internally.
77 // This also manages assignment of unique ids to GpuContexts, to allow
78 // for fast comparison of a context against the current context.
79 //
80 // CUDA-runtime-created contexts are avoided, if triple angle
81 // brace launches are required, by using the scoped activations in
82 // gpu/gpu_activation.h.
// Manages the singleton map of contexts that we've created, mapping
// from the CUcontext to the GpuContext* that we pass around internally.
// This also manages assignment of unique ids to GpuContexts, to allow
// for fast comparison of a context against the current context.
//
// CUDA-runtime-created contexts are avoided, if triple angle
// brace launches are required, by using the scoped activations in
// gpu/gpu_activation.h.
//
// Thread-safe: all accesses to the live map are guarded by mu_.
class CreatedContexts {
 public:
  // Returns whether context is a member of the live set.
  static bool Has(CUcontext context) {
    absl::ReaderMutexLock lock(&mu_);
    return Live()->find(context) != Live()->end();
  }

  // Adds context to the live set, or returns it if it's already present.
  static GpuContext* Add(CUcontext context) {
    CHECK(context != nullptr);
    absl::MutexLock lock(&mu_);
    auto insert_result = Live()->insert(std::make_pair(context, nullptr));
    auto it = insert_result.first;
    if (insert_result.second) {
      // context was not present in the map. Add it.
      it->second = absl::make_unique<GpuContext>(context, next_id_++);
    }
    return it->second.get();
  }

  // Removes context from the live set. CHECK-fails if the context was never
  // added (or was already removed).
  static void Remove(CUcontext context) {
    CHECK(context != nullptr);
    absl::MutexLock lock(&mu_);
    auto it = Live()->find(context);
    CHECK(it != Live()->end()) << context;
    Live()->erase(it);
  }

 private:
  // Returns the live map singleton. Heap-allocated and deliberately never
  // destroyed to avoid destruction-order issues at process exit.
  static std::map<CUcontext, std::unique_ptr<GpuContext>>* Live() {
    static auto singleton =
        new std::map<CUcontext, std::unique_ptr<GpuContext>>;
    return singleton;
  }

  // Lock that guards access-to/mutation-of the live set.
  static absl::Mutex mu_;
  // Monotonically increasing id handed to each new GpuContext.
  static int64 next_id_;
};
125
// Out-of-line definitions for CreatedContexts' static members. kConstInit
// gives the mutex constant initialization (usable before main()).
/* static */ absl::Mutex CreatedContexts::mu_{absl::kConstInit};
/* static */ int64 CreatedContexts::next_id_ = 1;  // 0 means "no context"
128
129 // Formats CUresult to output prettified values into a log stream.
ToString(CUresult result)130 std::string ToString(CUresult result) {
131 const char* error_name;
132 if (cuGetErrorName(result, &error_name)) {
133 return absl::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
134 }
135 const char* error_string;
136 if (cuGetErrorString(result, &error_string)) {
137 return error_name;
138 }
139 return absl::StrCat(error_name, ": ", error_string);
140 }
141
// Returns the current context and checks that it is in the set of CUDA contexts
// created by StreamExecutor (to ensure that the CUDA runtime didn't create a
// context behind our backs).
CUcontext CurrentContext() {
  CUcontext current = cuda::CurrentContextOrDie();
  // A non-null context that we did not create means some other code (most
  // likely the CUDA runtime) established it; that is a fatal invariant
  // violation for this driver wrapper.
  if (current != nullptr && !CreatedContexts::Has(current)) {
    LOG(FATAL) << "current context was not created by the StreamExecutor "
                  "cuda_driver API: "
               << current
               << "; a CUDA runtime call "
                  "was likely performed without using a StreamExecutor context";
  }
  return current;
}
156
// CUDA driver routines may require a large amount of stack (particularly
// cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
// stack-limited threads (such as those spawned by a default-argument
// thread::ThreadPool on some platforms), we run certain routines in this pool
// and wait for completion.
port::ThreadPool* GetDriverExecutor() {
  // Single-threaded pool; leaked intentionally (lives for process lifetime).
  static port::ThreadPool* thread_pool = new port::ThreadPool(
      port::Env::Default(), port::ThreadOptions(), "cuda_driver", 1);
  return thread_pool;
}
167
168 } // namespace
169
MemorySpaceString(MemorySpace memory_space)170 std::string MemorySpaceString(MemorySpace memory_space) {
171 switch (memory_space) {
172 case MemorySpace::kHost:
173 return "host";
174 case MemorySpace::kDevice:
175 return "device";
176 default:
177 LOG(FATAL) << "impossible memory space";
178 }
179 }
180
181 namespace {
182
// Call cuCtxSynchronize and crash (with the current stack trace in the
// message) if it doesn't succeed.
void SynchronizeOrDie() {
  FAIL_IF_CUDA_RES_ERROR(cuCtxSynchronize(),
                         "Synchronize fail: ", port::CurrentStackTrace());
}
188
// Per-thread record of which GpuContext this thread currently has active,
// maintained by ScopedActivateContext. POD so it can live in the static
// thread-local storage below.
struct ThreadLocalData {
  int64 id;             // id of the currently-active GpuContext (0 = none)
  GpuContext* context;  // Only valid if id == a known good context.
  int depth;            // nesting depth of ScopedActivateContext scopes
};

SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
196
197 } // namespace
198
// Activates cuda_context on the calling thread for the lifetime of this
// object, tracking nesting via thread-local state so redundant
// cuCtxSetCurrent calls are skipped.
ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
  // Optional debugging aid: synchronize the device around activation.
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  // If this is an outermost scope, we must not assume that the CUDA context has
  // been left in the same state we left it. Other code may have run on this
  // thread and altered the context.
  if (tls->depth == 0) {
    VLOG(3) << "ScopedActivateContext switching to " << cuda_context->id();
    FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                           "Failed setting context");
    tls->depth = 1;
    tls->id = cuda_context->id();
    tls->context = cuda_context;
    // Outermost scope restores nothing on destruction; the context is
    // deliberately left current (see the destructor).
    to_restore_ = nullptr;
    return;
  }

  tls->depth++;
  if (tls->id == cuda_context->id()) {
    // Re-activating the already-current context: no driver call needed.
    if (kVerifyGpuContext) {
      CHECK_EQ(CurrentContext(), cuda_context->context());
    }
    DCHECK_EQ(CurrentContext(), cuda_context->context());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching context from " << tls->id
          << " to " << cuda_context->id();

  // Remember the previously-active context so the destructor can restore it.
  to_restore_ = tls->context;
  // Set the context and update thread local.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                         "Failed setting context");
  tls->id = cuda_context->id();
  tls->context = cuda_context;
}
237
// Pops this activation scope: restores the previously-active context (if this
// scope switched contexts) and decrements the thread-local nesting depth.
ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
    // possible this could fail in the CurrentContext() call.
    CHECK_EQ(CurrentContext(),
             tls->context == nullptr ? nullptr : tls->context->context());
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);
  if (to_restore_ == nullptr) {
    // This scope did not switch contexts (or was outermost).
    // Leave context, tls->id, and tls->context set.
    return;
  }

  // Set context and update thread local.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(to_restore_->context()),
                         "Failed setting context");
  tls->id = to_restore_->id();
  tls->context = to_restore_;
}
263
264 namespace {
265
266 // Returns a stringified device number associated with pointer, primarily for
267 // logging purposes. Returns "?" if the device could not be successfully
268 // queried.
CUDAPointerToDeviceString(CUdeviceptr pointer)269 std::string CUDAPointerToDeviceString(CUdeviceptr pointer) {
270 auto value = GpuDriver::GetPointerDevice(pointer);
271 if (value.ok()) {
272 return absl::StrCat(value.ValueOrDie());
273 }
274 LOG(ERROR) << "could not query device: " << value.status();
275 return "?";
276 }
277
278 // Returns a stringified memory space associated with pointer, primarily for
279 // logging purposes. Returns "?" if the memory space could not be successfully
280 // queried.
CUDAPointerToMemorySpaceString(CUdeviceptr pointer)281 std::string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
282 auto value = GpuDriver::GetPointerMemorySpace(pointer);
283 if (value.ok()) {
284 return MemorySpaceString(value.ValueOrDie());
285 }
286 LOG(ERROR) << "could not query device: " << value.status();
287 return "?";
288 }
289
// Returns a stringified representation of whether or not peer access is
// permitted between the "from" and "to" pointers' associated contexts,
// primarily for logging purposes. Returns "error" if an error is encountered
// in the process of querying.
std::string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
  auto from_context = GpuDriver::GetPointerContext(from);
  if (!from_context.ok()) {
    LOG(ERROR) << "could not retrieve source pointer's context: "
               << from_context.status();
    return "error";
  }
  auto to_context = GpuDriver::GetPointerContext(to);
  if (!to_context.ok()) {
    LOG(ERROR) << "could not retrieve destination pointer's context: "
               << to_context.status();
    return "error";
  }
  return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
                                        to_context.ValueOrDie())
             ? "true"
             : "false";
}
312
// Actually performs the work of CUDA initialization. Wrapped up in one-time
// execution guard (see GpuDriver::Init). Returns OK on success; otherwise
// logs diagnostics and returns an ABORTED status.
static port::Status InternalInit() {
  CUresult res = CUDA_ERROR_NO_DEVICE;
  if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
    // Testing hook: skip cuInit so initialization reports the default error.
    LOG(ERROR) << "injecting CUDA init error; initialization will fail";
  } else {
    res = cuInit(0 /* = flags */);
  }

  if (res == CUDA_SUCCESS) {
    return port::Status::OK();
  } else if (res == CUDA_ERROR_SHARED_OBJECT_INIT_FAILED) {
    // Commonly means no usable driver is installed; warn rather than error.
    LOG(WARNING) << "failed call to cuInit: " << ToString(res);
  } else {
    LOG(ERROR) << "failed call to cuInit: " << ToString(res);
  }

  Diagnostician::LogDiagnosticInformation();
  return port::Status(port::error::ABORTED,
                      absl::StrCat("failed call to cuInit: ", ToString(res)));
}
335
336 } // namespace
337
Init()338 /* static */ port::Status GpuDriver::Init() {
339 // Cached return value from calling InternalInit(), as cuInit need only be
340 // called once, but GpuDriver::Init may be called many times.
341 static port::Status* init_retval = [] {
342 return new port::Status(InternalInit());
343 }();
344 return *init_retval;
345 }
346
// Retrieves the CUdevice handle for the given ordinal via cuDeviceGet.
/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
                                               CUdevice* device) {
  RETURN_IF_CUDA_RES_ERROR(cuDeviceGet(device, device_ordinal),
                           "Failed call to cuDeviceGet");
  return port::Status::OK();
}
353
GetDeviceName(CUdevice device,std::string * device_name)354 /* static */ port::Status GpuDriver::GetDeviceName(CUdevice device,
355 std::string* device_name) {
356 static const size_t kCharLimit = 64;
357 absl::InlinedVector<char, 4> chars(kCharLimit);
358 RETURN_IF_CUDA_RES_ERROR(
359 cuDeviceGetName(chars.begin(), kCharLimit - 1, device),
360 "Failed to get device name");
361 chars[kCharLimit - 1] = '\0';
362 *device_name = chars.begin();
363 return port::Status::OK();
364 }
365
// Translates DeviceOptions bits into CUDA context-creation flags, OR-ing them
// into *flags. Always returns true (the bool return is reserved for future
// options that cannot be represented as context flags).
bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
                                 int* flags) {
  // Fails to compile when a new DeviceOptions flag is added, forcing this
  // translation to be revisited.
  static_assert(DeviceOptions::kMask == 0xf,
                "needs update for new device options");

  if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) {
    *flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
  }

  // If no flags are set the default is CU_CTX_SCHED_AUTO, which
  // in Google environments is very likely to mean SPIN.
  if (device_options.flags() & DeviceOptions::kScheduleSpin) {
    *flags |= CU_CTX_SCHED_SPIN;
  }
  if (device_options.flags() & DeviceOptions::kScheduleYield) {
    *flags |= CU_CTX_SCHED_YIELD;
  }
  if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) {
    *flags |= CU_CTX_SCHED_BLOCKING_SYNC;
  }

  return true;
}
389
// Retains the device's primary context (creating it if necessary), applying
// the requested DeviceOptions flags when possible, and registers it in
// CreatedContexts. On success *context is the registered GpuContext; on
// failure *context stays null and an INTERNAL status is returned.
/* static */ port::Status GpuDriver::CreateContext(
    int device_ordinal, CUdevice device, const DeviceOptions& device_options,
    GpuContext** context) {
  *context = nullptr;

  int flags = 0;
  if (!DeviceOptionsToContextFlags(device_options, &flags)) {
    LOG(WARNING) << "could not convert all device options into context flags";
  }

  CUresult res;
  CUcontext former_context;
  CUcontext new_context;

  // Primary-context flags can only be changed while the context is inactive;
  // if it is already active with different flags, we can only warn.
  unsigned int former_primary_context_flags;
  int former_primary_context_is_active;
  CHECK_EQ(CUDA_SUCCESS,
           cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
                                      &former_primary_context_is_active));
  if (former_primary_context_flags != flags) {
    if (former_primary_context_is_active) {
      LOG(ERROR)
          << "The primary context is active and has a different flag set ("
          << former_primary_context_flags << ") than the desired flag set ("
          << flags << ").";
    } else {
      CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
    }
  }

  // cuDevicePrimaryCtxRetain may change the current context; remember the
  // caller's context so it can be restored below.
  former_context = cuda::CurrentContextOrDie();
  res = cuDevicePrimaryCtxRetain(&new_context, device);
  if (former_context != nullptr) {
    CUdevice former_device;
    if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
      if (former_device == device) {
        if (former_context == new_context) {
          VLOG(2) << "The primary context " << former_context << " for device "
                  << device
                  << " exists before initializing the StreamExecutor.";
        } else {
          LOG(WARNING) << "A non-primary context " << former_context
                       << " for device " << device
                       << " exists before initializing the StreamExecutor. The "
                       << "primary context is now " << new_context << ". We "
                       << "haven't verified StreamExecutor works with that.";
        }
      }
    } else {
      LOG(ERROR) << "Failed to get the device of the current context "
                 << former_context;
    }
  }
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));

  if (res == CUDA_SUCCESS) {
    *context = CreatedContexts::Add(new_context);
    CHECK(*context != nullptr)
        << "success in this call must entail non-null result";
    VLOG(2) << "created or reused context " << new_context
            << " for this thread";
    return port::Status::OK();
  }

  std::string message =
      "failed call to cuDevicePrimaryCtxRetain: " + ToString(res);
  if (res == CUDA_ERROR_OUT_OF_MEMORY) {
    // Augment OOM failures with total device memory to aid debugging.
    uint64 total_memory;
    if (GetDeviceTotalMemory(device, &total_memory)) {
      absl::StrAppend(&message, "; total memory reported: ", total_memory);
    } else {
      absl::StrAppend(&message, "; could not query total memory");
    }
  }

  return port::Status(port::error::INTERNAL, message);
}
467
DestroyContext(GpuContext * context)468 /* static */ void GpuDriver::DestroyContext(GpuContext* context) {
469 if (context == nullptr) {
470 return;
471 }
472 CUcontext former_context = CurrentContext();
473 CUresult res = cuCtxSetCurrent(context->context());
474 CUdevice device;
475 cuCtxGetDevice(&device);
476 cuCtxSetCurrent(former_context);
477
478 res = cuDevicePrimaryCtxRelease(device);
479
480 if (res != CUDA_SUCCESS) {
481 LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
482 }
483
484 CreatedContexts::Remove(context->context());
485 }
486
// Queries a single attribute (registers, shared memory, etc.) of a loaded
// kernel function into *attribute_value.
/* static */ port::Status GpuDriver::FuncGetAttribute(
    CUfunction_attribute attribute, CUfunction func, int* attribute_value) {
  RETURN_IF_CUDA_RES_ERROR(cuFuncGetAttribute(attribute_value, attribute, func),
                           "Failed to query kernel attribute: ", attribute);
  return port::Status::OK();
}
493
// Sets the preferred L1/shared-memory cache split for a kernel function.
/* static */ port::Status GpuDriver::FuncSetCacheConfig(
    CUfunction function, CUfunc_cache cache_config) {
  RETURN_IF_CUDA_RES_ERROR(cuFuncSetCacheConfig(function, cache_config),
                           "Failed to set CUDA kernel cache config");
  return port::Status::OK();
}
500
// Returns the shared-memory bank-size configuration of `context`.
/* static */ port::StatusOr<CUsharedconfig>
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  CUsharedconfig shared_mem_config;
  // The query applies to the current context, so activate first.
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuCtxGetSharedMemConfig(&shared_mem_config),
                           "Failed to get shared memory config");
  return shared_mem_config;
}
509
// Sets the shared-memory bank-size configuration for `context`.
/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
    GpuContext* context, CUsharedconfig shared_mem_config) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuCtxSetSharedMemConfig(shared_mem_config),
                           "Failed to set shared memory config");
  return port::Status::OK();
}
517
// Enqueues `function` on `stream` with the given grid/block geometry and
// dynamic shared-memory size. kernel_params/extra are forwarded verbatim to
// cuLaunchKernel (per the driver API, at most one of them should be used).
/* static */ port::Status GpuDriver::LaunchKernel(
    GpuContext* context, CUfunction function, unsigned int grid_dim_x,
    unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
    unsigned int block_dim_y, unsigned int block_dim_z,
    unsigned int shared_mem_bytes, CUstream stream, void** kernel_params,
    void** extra) {
  ScopedActivateContext activation(context);
  VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
          << " bdz: " << block_dim_z;
  RETURN_IF_CUDA_RES_ERROR(
      cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x,
                     block_dim_y, block_dim_z, shared_mem_bytes, stream,
                     kernel_params, extra),
      "Failed to launch CUDA kernel");
  return port::Status::OK();
}
536
// Loads an in-memory CUBIN (fat binary) image into *module within `context`.
/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
                                               const char* cubin_bytes,
                                               CUmodule* module) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuModuleLoadFatBinary(module, cubin_bytes),
                           "Failed to load in-memory CUBIN");
  return port::Status::OK();
}
545
LoadPtx(GpuContext * context,const char * ptx_contents,CUmodule * module)546 /* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
547 const char* ptx_contents,
548 CUmodule* module) {
549 absl::Notification notification;
550 port::Status ret = port::Status::OK();
551 GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
552 ¬ification]() {
553 ScopedActivateContext activation(context);
554 void* ptx_data = const_cast<char*>(ptx_contents);
555 static const unsigned int kLogBufferBytesLimit = 1024;
556 unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
557 unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
558 absl::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes);
559 absl::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes);
560 bool log_verbose = true;
561 CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
562 CU_JIT_ERROR_LOG_BUFFER,
563 CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
564 CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE};
565 // Note that the driver API wants the contents of this values to be stored
566 // in an array of void*s, so we coerce them accordingly.
567 void* option_values[] = {
568 absl::bit_cast<void*>(uintptr_t(error_log_buffer_bytes)),
569 absl::bit_cast<void*>(error_log_buffer.data()),
570 absl::bit_cast<void*>(uintptr_t(info_log_buffer_bytes)),
571 absl::bit_cast<void*>(info_log_buffer.data()),
572 absl::bit_cast<void*>(uintptr_t(log_verbose))};
573 CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
574
575 CUresult res;
576 {
577 // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
578 // module loading: see http://b/13248943
579 absl::LeakCheckDisabler disabler;
580 res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options), options,
581 option_values);
582 }
583
584 // The PTX JIT mutates the values in the option values array to reflect the
585 // size of the logs it output; now that we've made the call, read the values
586 // back out.
587 error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
588 info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
589 CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
590 CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);
591
592 if (res != CUDA_SUCCESS) {
593 LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
594 // As a precaution for null termination of the API-provided value, ensure
595 // that at least the last byte is null.
596 error_log_buffer[error_log_buffer_bytes ? error_log_buffer_bytes - 1
597 : 0] = '\0';
598 LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
599 << " bytes): " << error_log_buffer.data();
600 ret = port::InternalError(
601 absl::StrCat("Failed to load PTX text as a module: ", ToString(res)));
602 notification.Notify();
603 }
604
605 VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
606 << " bytes): " << info_log_buffer.data();
607 VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
608 << " bytes): " << error_log_buffer.data();
609 CHECK(module != nullptr);
610 notification.Notify();
611 });
612 notification.WaitForNotification();
613
614 return ret;
615 }
616
// HSACO is the ROCm (AMD) code-object format; this CUDA backend cannot load
// it, so the shared GpuDriver interface member always reports failure.
/* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
                                               const char* hsaco_contents,
                                               CUmodule* module) {
  return port::InternalError(
      "Feature not supported on CUDA platform (LoadHsaco)");
}
623
// Synchronously fills `size` bytes at device address `location` with `value`.
/* static */ port::Status GpuDriver::SynchronousMemsetUint8(
    GpuContext* context, CUdeviceptr location, uint8 value, size_t size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuMemsetD8(location, value, size),
                           "Failed to memset memory");
  return port::Status::OK();
}
631
// Synchronously fills `uint32_count` 32-bit words at device address
// `location` with `value`.
/* static */ port::Status GpuDriver::SynchronousMemsetUint32(
    GpuContext* context, CUdeviceptr location, uint32 value,
    size_t uint32_count) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuMemsetD32(location, value, uint32_count),
                           "Failed to memset memory");
  return port::Status::OK();
}
640
// Enqueues an asynchronous byte-wise memset on `stream`.
// NOTE(review): the parameter is named `uint32_count` but is passed to
// cuMemsetD8Async, so it is actually a count of BYTES; the misleading name is
// kept to avoid churning the declaration in the header.
/* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
    GpuContext* context, CUdeviceptr location, uint8 value, size_t uint32_count,
    CUstream stream) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemsetD8Async(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return port::Status::OK();
}
650
// Enqueues an asynchronous memset of `uint32_count` 32-bit words on `stream`.
/* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
    GpuContext* context, CUdeviceptr location, uint32 value,
    size_t uint32_count, CUstream stream) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemsetD32Async(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return port::Status::OK();
}
660
// Registers a host callback to run after currently-enqueued stream work
// completes. Returns false (with a logged error) if registration fails.
/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
                                               CUstream stream,
                                               StreamCallback callback,
                                               void* data) {
  // Note: flags param is required to be zero according to CUDA 6.0.
  CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "unable to add host callback: " << ToString(res);
    return false;
  }
  return true;
}
673
// Looks up kernel `kernel_name` in a loaded module. Returns false (with a
// logged error) if the kernel is not found.
/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
                                               CUmodule module,
                                               const char* kernel_name,
                                               CUfunction* function) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && kernel_name != nullptr);
  CUresult res = cuModuleGetFunction(function, module, kernel_name);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
               << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}
689
// Looks up global symbol `symbol_name` in a loaded module, returning its
// device address and/or size through the non-null out-params. A miss is
// expected (the symbol may live in another module) and only VLOG'd.
/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
                                             CUmodule module,
                                             const char* symbol_name,
                                             CUdeviceptr* dptr, size_t* bytes) {
  ScopedActivateContext activated{context};
  // At least one of the two out-params must be requested.
  CHECK(module != nullptr && symbol_name != nullptr &&
        (dptr != nullptr || bytes != nullptr));
  CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
  if (res != CUDA_SUCCESS) {
    // symbol may not be found in the current module, but it may reside in
    // another module.
    VLOG(2) << "failed to get symbol \"" << symbol_name
            << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}
708
// Unloads `module` from `context`. Failures are logged (leaking the module)
// rather than propagated.
/* static */ void GpuDriver::UnloadModule(GpuContext* context,
                                          CUmodule module) {
  ScopedActivateContext activated{context};
  CUresult res = cuModuleUnload(module);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to unload module " << module
               << "; leaking: " << ToString(res);
  }
}
718
// Returns the CUdevice that `context` is bound to, or an INTERNAL status on
// driver failure.
/* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
    GpuContext* context) {
  ScopedActivateContext activated{context};
  CUdevice device = -1;
  CUresult result = cuCtxGetDevice(&device);
  if (result == CUDA_SUCCESS) {
    return device;
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrCat("failed to get device for context: ", ToString(result)));
}
732
// Creates a CUDA stream in `context`, optionally with a non-default
// priority. Returns false (with a logged error) on failure.
/* static */ bool GpuDriver::CreateStream(GpuContext* context, CUstream* stream,
                                          int priority) {
  // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
  // up synchronization with respect to memsets and any other things that have
  // to occur on the default stream?
  ScopedActivateContext activated{context};
  CUresult res;
  // If the priority is 0, then use the previous api to create the stream with
  // the default priority for backward compatibility. Probably there is no
  // difference in using the new api call but leaving it as is for now.
  if (priority == 0) {
    res = cuStreamCreate(stream, 0);
  } else {
    res = cuStreamCreateWithPriority(stream, 0, priority);
  }
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not allocate CUDA stream for context "
               << context->context() << ": " << ToString(res);
    return false;
  }

  VLOG(2) << "successfully created stream " << *stream << " for context "
          << context->context() << " on thread";
  return true;
}
758
// Destroys *stream and nulls it out on success; a null *stream is a no-op.
// Failures are logged rather than propagated (and *stream is left set).
/* static */ void GpuDriver::DestroyStream(GpuContext* context,
                                           CUstream* stream) {
  if (*stream == nullptr) {
    return;
  }

  ScopedActivateContext activated{context};
  CUresult res = cuStreamDestroy(*stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to destroy CUDA stream for context "
               << context->context() << ": " << ToString(res);
  } else {
    VLOG(2) << "successfully destroyed stream " << *stream << " for context "
            << context->context();
    *stream = nullptr;
  }
}
776
// Allocates `bytes` of device memory in `context`, returning an opaque
// device pointer (as void*), or nullptr on failure or for zero-byte requests.
/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
                                             uint64 bytes) {
  if (bytes == 0) {
    return nullptr;
  }

  ScopedActivateContext activated{context};
  CUdeviceptr result = 0;
  CUresult res = cuMemAlloc(&result, bytes);
  if (res != CUDA_SUCCESS) {
    // LOG(INFO) because this isn't always important to users (e.g. BFCAllocator
    // implements a retry if the first allocation fails).
    LOG(INFO) << "failed to allocate "
              << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
              << " bytes) from device: " << ToString(res);
    return nullptr;
  }
  void* ptr = reinterpret_cast<void*>(result);
  VLOG(2) << "allocated " << ptr << " for context " << context->context()
          << " of " << bytes << " bytes";
  return ptr;
}
799
// Frees device memory previously returned by DeviceAllocate. Failures are
// logged rather than propagated.
/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
                                              void* location) {
  ScopedActivateContext activation(context);
  // The void* was produced by reinterpret_cast from a CUdeviceptr; bit_cast
  // recovers the original handle.
  CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
  CUresult res = cuMemFree(pointer);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to free device memory at " << location
               << "; result: " << ToString(res);
  } else {
    VLOG(2) << "deallocated " << location << " for context "
            << context->context();
  }
}
813
UnifiedMemoryAllocate(GpuContext * context,uint64 bytes)814 /* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
815 uint64 bytes) {
816 ScopedActivateContext activation(context);
817 CUdeviceptr result = 0;
818 // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
819 CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
820 if (res != CUDA_SUCCESS) {
821 LOG(ERROR) << "failed to alloc " << bytes
822 << " bytes unified memory; result: " << ToString(res);
823 return nullptr;
824 }
825 void* ptr = reinterpret_cast<void*>(result);
826 VLOG(2) << "allocated " << ptr << " for context " << context->context()
827 << " of " << bytes << " bytes in unified memory";
828 return ptr;
829 }
830
UnifiedMemoryDeallocate(GpuContext * context,void * location)831 /* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
832 void* location) {
833 ScopedActivateContext activation(context);
834 CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
835 CUresult res = cuMemFree(pointer);
836 if (res != CUDA_SUCCESS) {
837 LOG(ERROR) << "failed to free unified memory at " << location
838 << "; result: " << ToString(res);
839 } else {
840 VLOG(2) << "deallocated unified memory at " << location << " for context "
841 << context->context();
842 }
843 }
844
HostAllocate(GpuContext * context,uint64 bytes)845 /* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
846 ScopedActivateContext activation(context);
847 void* host_mem = nullptr;
848 // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
849 CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
850 if (res != CUDA_SUCCESS) {
851 LOG(ERROR) << "failed to alloc " << bytes
852 << " bytes on host: " << ToString(res);
853 }
854 return host_mem;
855 }
856
HostDeallocate(GpuContext * context,void * location)857 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
858 void* location) {
859 ScopedActivateContext activation(context);
860 CUresult res = cuMemFreeHost(location);
861 if (res != CUDA_SUCCESS) {
862 LOG(ERROR) << "error deallocating host memory at " << location << ": "
863 << ToString(res);
864 }
865 }
866
HostRegister(GpuContext * context,void * location,uint64 bytes)867 /* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
868 uint64 bytes) {
869 ScopedActivateContext activation(context);
870 // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
871 CUresult res =
872 cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
873 if (res != CUDA_SUCCESS) {
874 LOG(ERROR) << "error registering host memory at " << location << ": "
875 << ToString(res);
876 return false;
877 }
878 return true;
879 }
880
HostUnregister(GpuContext * context,void * location)881 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
882 void* location) {
883 ScopedActivateContext activation(context);
884 CUresult res = cuMemHostUnregister(location);
885 if (res != CUDA_SUCCESS) {
886 LOG(ERROR) << "error unregistering host memory at " << location << ": "
887 << ToString(res);
888 return false;
889 }
890 return true;
891 }
892
893 #if CUDA_VERSION >= 10020
894 /* static */ port::StatusOr<GpuDriver::VmemSpan>
ReserveVirtualMemory(GpuContext * context,uint64 bytes)895 GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64 bytes) {
896 ScopedActivateContext activation(context);
897 CUdeviceptr base;
898 CUresult res = cuMemAddressReserve(&base, bytes, /*alignment=*/0,
899 /*addr=*/0, /*flags=*/0);
900 if (res != CUDA_SUCCESS) {
901 return port::InternalError(
902 absl::StrFormat("error reserving %d bytes of virtual GPU memory: %s",
903 bytes, ToString(res)));
904 }
905 return {{base, bytes}};
906 }
907
FreeVirtualMemory(GpuContext * context,GpuDriver::VmemSpan reservation)908 /* static */ void GpuDriver::FreeVirtualMemory(
909 GpuContext* context, GpuDriver::VmemSpan reservation) {
910 ScopedActivateContext activation(context);
911 CUresult res = cuMemAddressFree(reservation.base, reservation.size_bytes);
912 if (res != CUDA_SUCCESS) {
913 LOG(ERROR) << "error freeing vmem reservation of size "
914 << reservation.size_bytes << " at address " << reservation.base;
915 }
916 }
917
GetMinAllocationGranularity(GpuDeviceHandle device)918 /* static */ port::StatusOr<uint64> GpuDriver::GetMinAllocationGranularity(
919 GpuDeviceHandle device) {
920 CUmemAllocationProp props = {};
921 props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
922 props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
923 props.location.id = device;
924
925 size_t granularity;
926 CUresult res = cuMemGetAllocationGranularity(
927 &granularity, &props, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
928 if (res != CUDA_SUCCESS) {
929 return port::InternalError(absl::StrCat(
930 "failed to get min allocation granularity: ", ToString(res)));
931 }
932 return granularity;
933 }
934
935 /* static */ port::StatusOr<GpuDriver::GenericMemoryHandle>
CreateMemoryHandle(GpuContext * context,uint64 bytes)936 GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
937 ScopedActivateContext activation(context);
938 auto device = DeviceFromContext(context);
939 if (!device.ok()) {
940 LOG(ERROR) << "Failed to get device from context" << device.status();
941 return device.status();
942 }
943
944 CUmemAllocationProp props = {};
945 props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
946 props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
947 props.location.id = device.ValueOrDie();
948
949 CUmemGenericAllocationHandle mem_handle;
950 CUresult res = cuMemCreate(&mem_handle, bytes, &props, 0);
951 if (res != CUDA_SUCCESS) {
952 return port::InternalError(
953 absl::StrFormat("failed to create memory allocation of size %d: %s",
954 bytes, ToString(res)));
955 }
956 return GpuDriver::GenericMemoryHandle{mem_handle, bytes};
957 }
958
ReleaseMemoryHandle(GpuContext * context,GpuDriver::GenericMemoryHandle handle)959 /* static */ void GpuDriver::ReleaseMemoryHandle(
960 GpuContext* context, GpuDriver::GenericMemoryHandle handle) {
961 ScopedActivateContext activation(context);
962
963 CUresult res = cuMemRelease(handle.handle);
964 if (res != CUDA_SUCCESS) {
965 LOG(ERROR) << "Failed to release memory handle " << handle.handle
966 << " of size " << handle.bytes << ": " << ToString(res);
967 }
968 }
969
MapMemory(GpuContext * context,CUdeviceptr va,const GpuDriver::GenericMemoryHandle & handle,const std::vector<GpuDeviceHandle> & device_handles)970 /* static */ port::Status GpuDriver::MapMemory(
971 GpuContext* context, CUdeviceptr va,
972 const GpuDriver::GenericMemoryHandle& handle,
973 const std::vector<GpuDeviceHandle>& device_handles) {
974 ScopedActivateContext activation(context);
975
976 auto device = DeviceFromContext(context);
977 if (!device.ok()) {
978 return device.status();
979 }
980
981 // NB: Zero is the only valid value for both flags and offset.
982 CUresult res =
983 cuMemMap(va, handle.bytes, /*offset=*/0, handle.handle, /*flags=*/0);
984 if (res != CUDA_SUCCESS) {
985 return port::InternalError(absl::StrFormat(
986 "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
987 }
988
989 std::vector<CUmemAccessDesc> access_descriptors(device_handles.size());
990 for (int i = 0; i < access_descriptors.size(); ++i) {
991 access_descriptors[i].location.id = device_handles[i];
992 access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
993 access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
994 }
995
996 res = cuMemSetAccess(va, handle.bytes, access_descriptors.data(),
997 access_descriptors.size());
998 if (res != CUDA_SUCCESS) {
999 // Unmap the memory that we failed to set access for.
1000 if (cuMemUnmap(va, handle.bytes) != CUDA_SUCCESS) {
1001 LOG(ERROR)
1002 << "Failed to unmap memory in GpuDriver::MapMemory error path.";
1003 }
1004 return port::InternalError(absl::StrFormat(
1005 "Failed to set read/write access on memory mapped at %d: %s", va,
1006 ToString(res)));
1007 }
1008 return port::Status::OK();
1009 }
1010
UnmapMemory(GpuContext * context,CUdeviceptr va,uint64 bytes)1011 /* static */ void GpuDriver::UnmapMemory(GpuContext* context, CUdeviceptr va,
1012 uint64 bytes) {
1013 ScopedActivateContext activation(context);
1014
1015 CUresult res = cuMemUnmap(va, bytes);
1016 if (res != CUDA_SUCCESS) {
1017 LOG(ERROR) << "Failed to unmap memory at " << va << " of size " << bytes
1018 << ": " << ToString(res);
1019 }
1020 }
1021
1022 #endif
1023
// Destroys the given CUDA event; INVALID_ARGUMENT if the handle is null.
/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                  CUevent* event) {
  if (*event == nullptr) {
    return port::Status(port::error::INVALID_ARGUMENT,
                        "input event cannot be null");
  }

  ScopedActivateContext activated{context};
  // NOTE(review): unlike DestroyStream, *event is not reset to nullptr after a
  // successful destroy — callers must not reuse the handle.
  RETURN_IF_CUDA_RES_ERROR(cuEventDestroy(*event),
                           "Error destroying CUDA event");
  return port::Status::OK();
}
1036
// Enqueues a record of `event` onto `stream`; returns the driver error, if
// any, wrapped as a Status.
/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
                                                 CUevent event,
                                                 CUstream stream) {
  ScopedActivateContext activated{context};
  RETURN_IF_CUDA_RES_ERROR(cuEventRecord(event, stream),
                           "Error recording CUDA event");
  return port::Status::OK();
}
1045
QueryEvent(GpuContext * context,CUevent event)1046 /* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
1047 CUevent event) {
1048 ScopedActivateContext activated{context};
1049 CUresult res = cuEventQuery(event);
1050 if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
1051 return port::Status(
1052 port::error::INTERNAL,
1053 absl::StrFormat("failed to query event: %s", ToString(res)));
1054 }
1055
1056 return res;
1057 }
1058
GetEventElapsedTime(GpuContext * context,float * elapsed_milliseconds,CUevent start,CUevent stop)1059 /* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
1060 float* elapsed_milliseconds,
1061 CUevent start, CUevent stop) {
1062 ScopedActivateContext activated{context};
1063 // The stop event must have completed in order for cuEventElapsedTime to
1064 // work.
1065 CUresult res = cuEventSynchronize(stop);
1066 if (res != CUDA_SUCCESS) {
1067 LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
1068 return false;
1069 }
1070 res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
1071 if (res != CUDA_SUCCESS) {
1072 LOG(ERROR) << "failed to get elapsed time between events: "
1073 << ToString(res);
1074 return false;
1075 }
1076
1077 return true;
1078 }
1079
WaitStreamOnEvent(GpuContext * context,CUstream stream,CUevent event)1080 /* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
1081 CUstream stream, CUevent event) {
1082 ScopedActivateContext activation(context);
1083 CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
1084 if (res != CUDA_SUCCESS) {
1085 LOG(ERROR) << "could not wait stream on event: " << ToString(res);
1086 return false;
1087 }
1088
1089 return true;
1090 }
1091
SynchronizeContext(GpuContext * context)1092 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
1093 ScopedActivateContext activation(context);
1094 CUresult res = cuCtxSynchronize();
1095 if (res != CUDA_SUCCESS) {
1096 LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
1097 << " :: " << port::CurrentStackTrace();
1098 return false;
1099 }
1100
1101 return true;
1102 }
1103
// Blocks until all work enqueued on `stream` (which must be non-null) has
// completed.
/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
                                                       CUstream stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  RETURN_IF_CUDA_RES_ERROR(cuStreamSynchronize(stream),
                           "Could not synchronize CUDA stream");
  return port::Status::OK();
}
1112
IsStreamIdle(GpuContext * context,CUstream stream)1113 /* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
1114 CUstream stream) {
1115 ScopedActivateContext activated{context};
1116 CHECK(stream != nullptr);
1117 CUresult res = cuStreamQuery(stream);
1118 if (res == CUDA_SUCCESS) {
1119 return true;
1120 }
1121
1122 if (res != CUDA_ERROR_NOT_READY) {
1123 LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
1124 }
1125 return false;
1126 }
1127
// Copies `size` bytes from device memory `gpu_src` to `host_dst`, blocking
// until the copy completes.
/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
                                                          void* host_dst,
                                                          CUdeviceptr gpu_src,
                                                          uint64 size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemcpyDtoH(host_dst, gpu_src, size),
      absl::StrFormat("failed to synchronous memcpy from device to host "
                      "host dst: %p; GPU src: %p; size: %u=0x%x",
                      host_dst, absl::bit_cast<void*>(gpu_src), size, size));
  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
          << host_dst;
  return port::Status::OK();
}
1142
// Copies `size` bytes from `host_src` to device memory `gpu_dst`, blocking
// until the copy completes.
/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
                                                          CUdeviceptr gpu_dst,
                                                          const void* host_src,
                                                          uint64 size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemcpyHtoD(gpu_dst, host_src, size),
      absl::StrFormat(
          "failed to synchronous memcpy from host to device: GPU dst: %p;"
          " host src: %p; size: %u=0x%x",
          absl::bit_cast<void*>(gpu_dst), host_src, size, size));
  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
  return port::Status::OK();
}
1157
SynchronousMemcpyD2D(GpuContext * context,CUdeviceptr gpu_dst,CUdeviceptr gpu_src,uint64 size)1158 /* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
1159 CUdeviceptr gpu_dst,
1160 CUdeviceptr gpu_src,
1161 uint64 size) {
1162 ScopedActivateContext activation(context);
1163 RETURN_IF_CUDA_RES_ERROR(
1164 cuMemcpyDtoD(gpu_dst, gpu_src, size),
1165 absl::StrFormat(
1166 "failed to synchronous memcpy from host to device: GPU dst: %p; "
1167 "GPU src: %p; size: %u=0x%x",
1168 absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
1169 size));
1170 VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
1171 return port::Status::OK();
1172 }
1173
AsynchronousMemcpyD2H(GpuContext * context,void * host_dst,CUdeviceptr gpu_src,uint64 size,CUstream stream)1174 /* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
1175 void* host_dst,
1176 CUdeviceptr gpu_src,
1177 uint64 size,
1178 CUstream stream) {
1179 ScopedActivateContext activation(context);
1180 CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
1181 if (res != CUDA_SUCCESS) {
1182 LOG(ERROR) << absl::StrFormat(
1183 "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
1184 "GPU src: %p; size: %u=0x%x",
1185 ToString(res), host_dst, absl::bit_cast<void*>(gpu_src), size, size);
1186 return false;
1187 }
1188 VLOG(2) << "successfully enqueued async memcpy d2h of " << size
1189 << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
1190 << host_dst << " on stream " << stream;
1191 return true;
1192 }
1193
AsynchronousMemcpyH2D(GpuContext * context,CUdeviceptr gpu_dst,const void * host_src,uint64 size,CUstream stream)1194 /* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
1195 CUdeviceptr gpu_dst,
1196 const void* host_src,
1197 uint64 size,
1198 CUstream stream) {
1199 ScopedActivateContext activation(context);
1200 CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
1201 if (res != CUDA_SUCCESS) {
1202 LOG(ERROR) << absl::StrFormat(
1203 "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
1204 "host src: %p; size: %u=0x%x",
1205 ToString(res), absl::bit_cast<void*>(gpu_dst), host_src, size, size);
1206 return false;
1207 }
1208 VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
1209 << " on stream " << stream;
1210 return true;
1211 }
1212
AsynchronousMemcpyD2D(GpuContext * context,CUdeviceptr gpu_dst,CUdeviceptr gpu_src,uint64 size,CUstream stream)1213 /* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
1214 CUdeviceptr gpu_dst,
1215 CUdeviceptr gpu_src,
1216 uint64 size,
1217 CUstream stream) {
1218 ScopedActivateContext activation(context);
1219 CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
1220 if (result != CUDA_SUCCESS) {
1221 LOG(ERROR) << absl::StrFormat(
1222 "failed to enqueue async memcpy from device to device: %s"
1223 "; GPU dst: %p on %s %s"
1224 "; GPU src: %p on %s %s"
1225 "; can access? %s; size: %u=0x%x",
1226 ToString(result), absl::bit_cast<void*>(gpu_dst),
1227 CUDAPointerToMemorySpaceString(gpu_dst),
1228 CUDAPointerToDeviceString(gpu_dst), absl::bit_cast<void*>(gpu_src),
1229 CUDAPointerToMemorySpaceString(gpu_src),
1230 CUDAPointerToDeviceString(gpu_src),
1231 CUDAPointersToCanAccessString(gpu_src, gpu_dst), size, size);
1232
1233 return false;
1234 }
1235 VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
1236 return true;
1237 }
1238
InitEvent(GpuContext * context,CUevent * result,EventFlags flags)1239 /* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
1240 CUevent* result,
1241 EventFlags flags) {
1242 int cuflags;
1243 switch (flags) {
1244 case EventFlags::kDefault:
1245 cuflags = CU_EVENT_DEFAULT;
1246 break;
1247 case EventFlags::kDisableTiming:
1248 cuflags = CU_EVENT_DISABLE_TIMING;
1249 break;
1250 default:
1251 LOG(FATAL) << "impossible event flags: " << int(flags);
1252 }
1253
1254 ScopedActivateContext activated{context};
1255 CUresult res = cuEventCreate(result, cuflags);
1256
1257 if (res == CUDA_SUCCESS) {
1258 return port::Status::OK();
1259 } else if (res == CUDA_ERROR_OUT_OF_MEMORY) {
1260 return port::Status(port::error::RESOURCE_EXHAUSTED,
1261 "could not create CUDA event: out of device memory");
1262 } else {
1263 return port::Status(
1264 port::error::FAILED_PRECONDITION,
1265 absl::StrCat("could not create CUDA event: ", ToString(res)));
1266 }
1267 }
1268
GetDeviceCount()1269 /* static */ int GpuDriver::GetDeviceCount() {
1270 int device_count = 0;
1271 CUresult res = cuDeviceGetCount(&device_count);
1272 if (res != CUDA_SUCCESS) {
1273 LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
1274 return 0;
1275 }
1276
1277 if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) {
1278 device_count = 1;
1279 }
1280 return device_count;
1281 }
1282
GetPointerContext(CUdeviceptr pointer)1283 /* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
1284 CUdeviceptr pointer) {
1285 GpuContext* context = nullptr;
1286 CUresult result =
1287 cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
1288 if (result == CUDA_SUCCESS) {
1289 CHECK(context != nullptr) << "success should entail non-null context";
1290 return context;
1291 }
1292
1293 return port::Status(
1294 port::error::INTERNAL,
1295 absl::StrCat("failed to query device pointer for context: ",
1296 ToString(result)));
1297 }
1298
GetPointerMemorySpace(CUdeviceptr pointer)1299 /* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
1300 CUdeviceptr pointer) {
1301 unsigned int value;
1302 CUresult result =
1303 cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
1304 if (result == CUDA_SUCCESS) {
1305 switch (value) {
1306 case CU_MEMORYTYPE_DEVICE:
1307 return MemorySpace::kDevice;
1308 case CU_MEMORYTYPE_HOST:
1309 return MemorySpace::kHost;
1310 default:
1311 return port::Status(
1312 port::error::INTERNAL,
1313 absl::StrCat("unknown memory space provided by CUDA API: ", value));
1314 }
1315 }
1316
1317 return port::Status(
1318 port::error::INTERNAL,
1319 absl::StrCat("failed to query device pointer for memory space: ",
1320 ToString(result)));
1321 }
1322
GetPointerAddressRange(CUdeviceptr dptr,CUdeviceptr * base,size_t * size)1323 /* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
1324 CUdeviceptr* base,
1325 size_t* size) {
1326 CUresult result = cuMemGetAddressRange(base, size, dptr);
1327 if (result == CUDA_SUCCESS) {
1328 return port::Status::OK();
1329 } else if (result == CUDA_ERROR_NOT_FOUND) {
1330 // We differentiate between "this pointer is unknown" (return here) and
1331 // "there was an internal error while performing this operation" (return
1332 // below).
1333 return port::Status(
1334 port::error::NOT_FOUND,
1335 absl::StrFormat("not a device pointer %p; %s",
1336 reinterpret_cast<void*>(dptr), ToString(result)));
1337 }
1338
1339 return port::Status(
1340 port::error::INTERNAL,
1341 absl::StrFormat("failed to get pointer into for device pointer %p; %s",
1342 reinterpret_cast<void*>(dptr), ToString(result)));
1343 }
1344
GetPointerDevice(CUdeviceptr pointer)1345 /* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
1346 CUdeviceptr pointer) {
1347 auto result = GetPointerContext(pointer);
1348 if (!result.ok()) {
1349 return result.status();
1350 }
1351
1352 return DeviceFromContext(result.ValueOrDie());
1353 }
1354
GetComputeCapability(int * cc_major,int * cc_minor,CUdevice device)1355 /* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
1356 int* cc_minor,
1357 CUdevice device) {
1358 *cc_major = 0;
1359 *cc_minor = 0;
1360
1361 CUresult res = cuDeviceGetAttribute(
1362 cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
1363 if (res != CUDA_SUCCESS) {
1364 return port::Status(
1365 port::error::INTERNAL,
1366 absl::StrFormat(
1367 "failed to get compute capability major for device: %s; %d",
1368 ToString(res), device));
1369 }
1370
1371 res = cuDeviceGetAttribute(
1372 cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
1373 if (res != CUDA_SUCCESS) {
1374 return port::Status(
1375 port::error::INTERNAL,
1376 absl::StrFormat(
1377 "failed to get compute capability minor for device: %s; %d",
1378 ToString(res), device));
1379 }
1380
1381 return port::Status::OK();
1382 }
1383
// Always fails: this query is not supported on the CUDA platform (the
// cross-platform GpuDriver interface requires the entry point to exist).
/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
                                                      CUdevice device) {
  return port::Status{
      port::error::INTERNAL,
      "Feature not supported on CUDA platform (GetGpuISAVersion)"};
}
1390
// Always fails: GCN architecture names are not a CUDA concept; this entry
// point exists only to satisfy the cross-platform GpuDriver interface.
/* static */ port::Status GpuDriver::GetGpuGCNArchName(CUdevice, std::string*) {
  return port::Status{
      port::error::INTERNAL,
      "Feature not supported on CUDA platform (GetGpuGCNArchName)"};
}
1396
1397 // Helper function that turns the integer output of cuDeviceGetAttribute to type
1398 // T and wraps it in a StatusOr.
1399 template <typename T>
GetSimpleAttribute(CUdevice device,CUdevice_attribute attribute)1400 static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
1401 CUdevice_attribute attribute) {
1402 int value = -1;
1403 RETURN_IF_CUDA_RES_ERROR(cuDeviceGetAttribute(&value, attribute, device),
1404 "Could not retrieve CUDA device attribute (",
1405 attribute);
1406 T converted = value;
1407 return converted;
1408 }
1409
// Returns the number of multiprocessors (SMs) on `device`.
/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
    CUdevice device) {
  return GetSimpleAttribute<int>(device,
                                 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
}
1415
// Returns the maximum shared memory available per multiprocessor, in bytes.
/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
    CUdevice device) {
  return GetSimpleAttribute<int64>(
      device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
}
1421
// Returns the maximum shared memory available per thread block, in bytes.
/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
    CUdevice device) {
  return GetSimpleAttribute<int64>(
      device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
}
1427
// Returns the maximum number of resident threads per multiprocessor.
/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
    CUdevice device) {
  return GetSimpleAttribute<int64>(
      device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
}
1433
// Returns the maximum number of threads allowed in one block.
/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
    CUdevice device) {
  return GetSimpleAttribute<int64>(device,
                                   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
}
1439
// Returns the maximum number of 32-bit registers available to a block.
/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
    CUdevice device) {
  return GetSimpleAttribute<int64>(device,
                                   CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
}
1445
// Returns the warp size (threads per warp) of `device`.
/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
    CUdevice device) {
  return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
}
1450
GetGridLimits(int * x,int * y,int * z,CUdevice device)1451 /* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
1452 CUdevice device) {
1453 int value;
1454 CUresult res =
1455 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
1456 if (res != CUDA_SUCCESS) {
1457 LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
1458 return false;
1459 }
1460 *x = value;
1461
1462 res =
1463 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
1464 if (res != CUDA_SUCCESS) {
1465 LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
1466 return false;
1467 }
1468 *y = value;
1469
1470 res =
1471 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
1472 if (res != CUDA_SUCCESS) {
1473 LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
1474 return false;
1475 }
1476 *z = value;
1477 return true;
1478 }
1479
GetDriverVersion(int * driver_version)1480 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
1481 CUresult res = cuDriverGetVersion(driver_version);
1482 if (res != CUDA_SUCCESS) {
1483 LOG(ERROR) << "failed to query driver version: " << ToString(res);
1484 return false;
1485 }
1486
1487 return true;
1488 }
1489
GetDeviceProperties(CUdevprop * device_properties,int device_ordinal)1490 /* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
1491 int device_ordinal) {
1492 CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
1493 if (res != CUDA_SUCCESS) {
1494 LOG(ERROR) << "failed to query device properties: " << ToString(res);
1495 return false;
1496 }
1497
1498 return true;
1499 }
1500
GetDeviceAttribute(CUdevice_attribute attribute,CUdevice device)1501 /* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
1502 CUdevice_attribute attribute, CUdevice device) {
1503 int val;
1504 CUresult res = cuDeviceGetAttribute(&val, attribute, device);
1505 if (res != CUDA_SUCCESS) {
1506 return port::Status(
1507 port::error::INTERNAL,
1508 absl::StrFormat("failed to get device attribute %d for device %d: %s",
1509 attribute, device, ToString(res)));
1510 }
1511 return val;
1512 }
1513
IsEccEnabled(CUdevice device,bool * result)1514 /* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
1515 int value = -1;
1516 CUresult res =
1517 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
1518 if (res != CUDA_SUCCESS) {
1519 LOG(ERROR) << "failed to query ECC status: " << ToString(res);
1520 return false;
1521 }
1522
1523 *result = value;
1524 return true;
1525 }
1526
GetDeviceMemoryInfo(GpuContext * context,int64 * free_out,int64 * total_out)1527 /* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
1528 int64* free_out,
1529 int64* total_out) {
1530 ScopedActivateContext activation(context);
1531 size_t free = 0;
1532 size_t total = 0;
1533 CUresult res = cuMemGetInfo(&free, &total);
1534 if (res != CUDA_SUCCESS) {
1535 LOG(ERROR) << "failed to query device memory info: " << ToString(res);
1536 return false;
1537 }
1538
1539 *free_out = free;
1540 *total_out = total;
1541 return true;
1542 }
1543
GetDeviceTotalMemory(CUdevice device,uint64 * result)1544 /* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
1545 uint64* result) {
1546 size_t value = -1;
1547 CUresult res = cuDeviceTotalMem(&value, device);
1548 if (res != CUDA_SUCCESS) {
1549 LOG(ERROR) << "failed to query total available memory: " << ToString(res);
1550 return false;
1551 }
1552
1553 *result = value;
1554 return true;
1555 }
1556
GetPCIBusID(CUdevice device)1557 /* static */ std::string GpuDriver::GetPCIBusID(CUdevice device) {
1558 std::string pci_bus_id;
1559 static const int kBufferSize = 64;
1560 absl::InlinedVector<char, 4> chars(kBufferSize);
1561 chars[kBufferSize - 1] = '\0';
1562 CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
1563 if (res != CUDA_SUCCESS) {
1564 LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
1565 return pci_bus_id;
1566 }
1567 pci_bus_id = chars.begin();
1568 return pci_bus_id;
1569 }
1570
CanEnablePeerAccess(GpuContext * from,GpuContext * to)1571 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
1572 GpuContext* to) {
1573 if (from == to) {
1574 return true; // A context can always access its own memory.
1575 }
1576
1577 auto from_device = DeviceFromContext(from);
1578 if (!from_device.ok()) {
1579 LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
1580 << from_device.status();
1581 return false;
1582 }
1583 auto to_device = DeviceFromContext(to);
1584 if (!to_device.ok()) {
1585 LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
1586 << to_device.status();
1587 return false;
1588 }
1589 return CanEnablePeerAccess(from_device.ValueOrDie(), to_device.ValueOrDie());
1590 }
1591
CanEnablePeerAccess(GpuDeviceHandle from,GpuDeviceHandle to)1592 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuDeviceHandle from,
1593 GpuDeviceHandle to) {
1594 int can_access_peer = -1;
1595 CUresult result = cuDeviceCanAccessPeer(&can_access_peer, from, to);
1596 if (result != CUDA_SUCCESS) {
1597 LOG(ERROR) << "failed to detect peer access capability: "
1598 << ToString(result);
1599 return false;
1600 }
1601 return can_access_peer;
1602 }
1603
EnablePeerAccess(GpuContext * from,GpuContext * to)1604 /* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
1605 GpuContext* to) {
1606 if (from == to) {
1607 return port::Status::OK(); // A context can always access its own memory.
1608 }
1609
1610 ScopedActivateContext activated{from};
1611 CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
1612 if (result != CUDA_SUCCESS &&
1613 result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
1614 return port::Status(
1615 port::error::INTERNAL,
1616 absl::StrFormat("failed to enable peer access from %p to %p: %s", from,
1617 to, ToString(result)));
1618 }
1619
1620 return port::Status::OK();
1621 }
1622
// Returns the maximum number of blocks of `kernel` that can be resident on a
// single multiprocessor, given the block size and dynamic shared memory use.
/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
    GpuContext* context, CUfunction kernel, int threads_per_block,
    size_t dynamic_shared_memory_bytes) {
  ScopedActivateContext activation(context);

  int max_blocks;
  RETURN_IF_CUDA_RES_ERROR(
      cuOccupancyMaxActiveBlocksPerMultiprocessor(
          &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes),
      absl::StrFormat("Failed to calculate occupancy of kernel %p", kernel));
  return max_blocks;
}
1635
1636 } // namespace gpu
1637
1638 namespace cuda {
1639
// Returns the calling thread's current CUDA context, aborting via
// FAIL_IF_CUDA_RES_ERROR if the driver query itself fails. If no context is
// bound, nullptr is returned.
CUcontext CurrentContextOrDie() {
  CUcontext current = nullptr;
  FAIL_IF_CUDA_RES_ERROR(cuCtxGetCurrent(&current),
                         "Failed to query current context");
  return current;
}
1646
1647 } // namespace cuda
1648 } // namespace stream_executor
1649