1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
17
18 #include <stdint.h>
19 #include <stdlib.h>
20 #include <map>
21 #include <set>
22 #include <utility>
23
24 #include "absl/base/casts.h"
25 #include "absl/container/inlined_vector.h"
26 #include "absl/debugging/leak_check.h"
27 #include "absl/strings/str_cat.h"
28 #include "cuda/include/cuda_runtime_api.h"
29 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
30 #include "tensorflow/stream_executor/lib/env.h"
31 #include "tensorflow/stream_executor/lib/error.h"
32 #include "tensorflow/stream_executor/lib/human_readable.h"
33 #include "tensorflow/stream_executor/lib/notification.h"
34 #include "tensorflow/stream_executor/lib/ptr_util.h"
35 #include "tensorflow/stream_executor/lib/stacktrace.h"
36 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
37 #include "tensorflow/stream_executor/lib/stringprintf.h"
38 #include "tensorflow/stream_executor/lib/threadpool.h"
39 #include "tensorflow/stream_executor/platform/logging.h"
40 #include "tensorflow/stream_executor/platform/mutex.h"
41 #include "tensorflow/stream_executor/platform/port.h"
42
// When true, InternalInit() skips the cuInit() call and reports an
// initialization failure instead (error injection).
bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
// When true, ScopedActivateContext calls SynchronizeOrDie() both when it is
// constructed and when it is destroyed.
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
// NOTE(review): not read in the visible part of this file; presumably limits
// the platform to device 0 -- confirm against its users.
bool FLAGS_gpuexec_cuda_device_0_only = false;

// Debugging: on each push and pop of a cuda context, verify the current context
// matches the expected one.
constexpr bool kVerifyGpuContext = false;
50
51 namespace stream_executor {
52 namespace gpu {
53 namespace {
54
55 // Manages the singleton map of contexts that we've created, mapping
56 // from the CUcontext to the GpuContext* that we pass around internally.
57 // This also manages assignment of unique ids to GpuContexts, to allow
58 // for fast comparison of a context against the current context.
59 //
60 // CUDA-runtime-created contexts are avoided, if triple angle
61 // brace launches are required, by using the scoped activations in
62 // gpu/gpu_activation.h.
63 class CreatedContexts {
64 public:
65 // Returns whether context is a member of the live set.
Has(CUcontext context)66 static bool Has(CUcontext context) {
67 tf_shared_lock lock(mu_);
68 return Live()->find(context) != Live()->end();
69 }
70
71 // Adds context to the live set, or returns it if it's already present.
Add(CUcontext context)72 static GpuContext* Add(CUcontext context) {
73 CHECK(context != nullptr);
74 mutex_lock lock(mu_);
75 auto insert_result = Live()->insert(std::make_pair(context, nullptr));
76 auto it = insert_result.first;
77 if (insert_result.second) {
78 // context was not present in the map. Add it.
79 it->second = MakeUnique<GpuContext>(context, next_id_++);
80 }
81 return it->second.get();
82 }
83
84 // Removes context from the live set.
Remove(CUcontext context)85 static void Remove(CUcontext context) {
86 CHECK(context != nullptr);
87 mutex_lock lock(mu_);
88 auto it = Live()->find(context);
89 CHECK(it != Live()->end()) << context;
90 Live()->erase(it);
91 }
92
93 private:
94 // Returns the live map singleton.
Live()95 static std::map<CUcontext, std::unique_ptr<GpuContext>>* Live() {
96 static auto singleton =
97 new std::map<CUcontext, std::unique_ptr<GpuContext>>;
98 return singleton;
99 }
100
101 // Lock that guards access-to/mutation-of the live set.
102 static mutex mu_;
103 static int64 next_id_;
104 };
105
106 /* static */ mutex CreatedContexts::mu_{LINKER_INITIALIZED};
107 /* static */ int64 CreatedContexts::next_id_ = 1; // 0 means "no context"
108
109 // Formats CUresult to output prettified values into a log stream.
ToString(CUresult result)110 string ToString(CUresult result) {
111 const char* error_name;
112 if (cuGetErrorName(result, &error_name)) {
113 return absl::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
114 }
115 const char* error_string;
116 if (cuGetErrorString(result, &error_string)) {
117 return error_name;
118 }
119 return absl::StrCat(error_name, ": ", error_string);
120 }
121
122 // Returns the current context and checks that it is in the set of CUDA contexts
123 // created by StreamExecutor (to ensure that the CUDA runtime didn't create a
124 // context behind our backs).
CurrentContext()125 CUcontext CurrentContext() {
126 CUcontext current = cuda::CurrentContextOrDie();
127 if (current != nullptr && !CreatedContexts::Has(current)) {
128 LOG(FATAL) << "current context was not created by the StreamExecutor "
129 "cuda_driver API: "
130 << current
131 << "; a CUDA runtime call "
132 "was likely performed without using a StreamExecutor context";
133 }
134 return current;
135 }
136
137 // CUDA driver routines may require a large amount of stack (particularly
138 // cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
139 // stack-limited threads (such as those spawned by a default-argument
140 // thread::ThreadPool on some platforms), we run certain routines in this pool
141 // and wait for completion.
142 static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
InitializeDriverExecutor()143 static port::ThreadPool* InitializeDriverExecutor() {
144 return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
145 "cuda_driver", 1);
146 }
147
GetDriverExecutor()148 port::ThreadPool* GetDriverExecutor() {
149 mutex_lock lock(driver_executor_threadpool_mu);
150 static port::ThreadPool* thread_pool = InitializeDriverExecutor();
151 return thread_pool;
152 }
153
154 } // namespace
155
MemorySpaceString(MemorySpace memory_space)156 string MemorySpaceString(MemorySpace memory_space) {
157 switch (memory_space) {
158 case MemorySpace::kHost:
159 return "host";
160 case MemorySpace::kDevice:
161 return "device";
162 default:
163 LOG(FATAL) << "impossible memory space";
164 }
165 }
166
167 namespace {
168
169 template <typename PtrT>
PointerIsValid(const PtrT ptr)170 bool PointerIsValid(const PtrT ptr) {
171 // Checks that the pointer is to a location on the device it purports to be.
172 // PtrT is one of CUdeviceptr or void*. If it's a CUdeviceptr, then
173 // cudaPointerGetAttributes should not fail, and return a memoryType of
174 // cudaMemoryTypeDevice.
175
176 bool is_host_ptr = !std::is_same<PtrT, CUdeviceptr>::value;
177 cudaPointerAttributes attributes;
178 cudaError_t err =
179 cudaPointerGetAttributes(&attributes, reinterpret_cast<const void*>(ptr));
180 // If we failed, reset cuda error status to avoid poisoning cuda streams.
181 if (err != cudaSuccess) cudaGetLastError();
182 bool points_to_host_memory = (err == cudaErrorInvalidValue ||
183 attributes.memoryType != cudaMemoryTypeDevice);
184 return (is_host_ptr == points_to_host_memory);
185 }
186
187 // Call cuCtxtSynchronize and crash if it doesn't succeed.
SynchronizeOrDie()188 void SynchronizeOrDie() {
189 auto res = cuCtxSynchronize();
190 if (res != CUDA_SUCCESS) {
191 LOG(FATAL) << "Synchronize found " << ToString(res)
192 << " :: " << port::CurrentStackTrace();
193 }
194 }
195
// Per-thread record of the context most recently activated through
// ScopedActivateContext.
struct ThreadLocalData {
  // id() of the GpuContext last activated on this thread.
  int64 id;
  GpuContext* context;  // Only valid if id == a known good context.
  // Number of ScopedActivateContext objects currently alive on this thread
  // (incremented by the constructor, decremented by the destructor).
  int depth;
};

// The thread-local instance; accessed through tls_data.get().
SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
203
204 } // namespace
205
// Makes `cuda_context` the calling thread's current CUDA context. The
// thread-local record in tls_data is consulted first so that the driver call
// is skipped when that context is already recorded as active.
ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();
  tls->depth++;
  // Fast path: the thread-local record already names this context, so no
  // switch is needed. to_restore_ is not assigned on this path; the
  // destructor relies on it being null here (presumably via its in-class
  // initializer -- confirm in the header).
  if (tls->id == cuda_context->id()) {
    if (kVerifyGpuContext) {
      CHECK_EQ(CurrentContext(), cuda_context->context());
    }
    DCHECK_EQ(CurrentContext(), cuda_context->context());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching context from " << tls->id
          << " to " << cuda_context->id();

  // The outermost activation on this thread (depth == 1) restores nothing on
  // destruction; a nested one remembers the context it is displacing.
  to_restore_ = (tls->depth == 1 ? nullptr : tls->context);

  // Set the context and update thread local.
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(cuda_context->context()));
  tls->id = cuda_context->id();
  tls->context = cuda_context;
}
229
// Ends this activation. If the constructor displaced another context (stored
// in to_restore_), that context is made current again; otherwise the current
// context and the thread-local record are left untouched.
ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
    // possible this could fail in the CurrentContext() call.
    CHECK_EQ(CurrentContext(),
             tls->context == nullptr ? nullptr : tls->context->context());
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);
  if (to_restore_ == nullptr) {
    // Leave context, tls->id, and tls->context set.
    return;
  }

  // Set context and update thread local.
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(to_restore_->context()));
  tls->id = to_restore_->id();
  tls->context = to_restore_;
}
254
255 namespace {
256
257 // Returns a stringified device number associated with pointer, primarily for
258 // logging purposes. Returns "?" if the device could not be successfully
259 // queried.
CUDAPointerToDeviceString(CUdeviceptr pointer)260 string CUDAPointerToDeviceString(CUdeviceptr pointer) {
261 auto value = GpuDriver::GetPointerDevice(pointer);
262 if (value.ok()) {
263 return absl::StrCat(value.ValueOrDie());
264 }
265 LOG(ERROR) << "could not query device: " << value.status();
266 return "?";
267 }
268
269 // Returns a stringified memory space associated with pointer, primarily for
270 // logging purposes. Returns "?" if the memory space could not be successfully
271 // queried.
CUDAPointerToMemorySpaceString(CUdeviceptr pointer)272 string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
273 auto value = GpuDriver::GetPointerMemorySpace(pointer);
274 if (value.ok()) {
275 return MemorySpaceString(value.ValueOrDie());
276 }
277 LOG(ERROR) << "could not query device: " << value.status();
278 return "?";
279 }
280
281 // Returns a stringified representation of whether or not peer access is
282 // permitted between the "from" and "to" pointers' associated contexts,
283 // primarily for logging purposes. Returns "error" if an error is encountered
284 // in the process of querying.
CUDAPointersToCanAccessString(CUdeviceptr from,CUdeviceptr to)285 string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
286 auto from_context = GpuDriver::GetPointerContext(from);
287 if (!from_context.ok()) {
288 LOG(ERROR) << "could not retrieve source pointer's context: "
289 << from_context.status();
290 return "error";
291 }
292 auto to_context = GpuDriver::GetPointerContext(to);
293 if (!to_context.ok()) {
294 LOG(ERROR) << "could not retrieve destination pointer's context: "
295 << to_context.status();
296 return "error";
297 }
298 return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
299 to_context.ValueOrDie())
300 ? "true"
301 : "false";
302 }
303
304 // Actually performs the work of CUDA initialization. Wrapped up in one-time
305 // execution guard.
InternalInit()306 static port::Status InternalInit() {
307 CUresult res = CUDA_ERROR_NO_DEVICE;
308 if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
309 LOG(ERROR) << "injecting CUDA init error; initialization will fail";
310 } else {
311 res = cuInit(0 /* = flags */);
312 }
313
314 if (res == CUDA_SUCCESS) {
315 return port::Status::OK();
316 }
317
318 LOG(ERROR) << "failed call to cuInit: " << ToString(res);
319 Diagnostician::LogDiagnosticInformation();
320 return port::Status(port::error::ABORTED,
321 absl::StrCat("failed call to cuInit: ", ToString(res)));
322 }
323
324 } // namespace
325
Init()326 /* static */ port::Status GpuDriver::Init() {
327 // Cached return value from calling InternalInit(), as cuInit need only be
328 // called once, but GpuDriver::Init may be called many times.
329 static port::Status init_retval;
330 static bool set = false;
331 static mutex* init_mu = new mutex;
332
333 mutex_lock lock(*init_mu);
334 if (!set) {
335 init_retval = InternalInit();
336 set = true;
337 }
338
339 return init_retval;
340 }
341
GetDevice(int device_ordinal,CUdevice * device)342 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
343 CUdevice* device) {
344 CUresult res = cuDeviceGet(device, device_ordinal);
345 if (res == CUDA_SUCCESS) {
346 return port::Status::OK();
347 }
348
349 return port::Status(
350 port::error::INTERNAL,
351 absl::StrCat("failed call to cuDeviceGet: ", ToString(res)));
352 }
353
GetDeviceName(CUdevice device,string * device_name)354 /* static */ bool GpuDriver::GetDeviceName(CUdevice device,
355 string* device_name) {
356 static const size_t kCharLimit = 64;
357 absl::InlinedVector<char, 4> chars(kCharLimit);
358 CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
359 if (res != CUDA_SUCCESS) {
360 LOG(ERROR) << "failed to get device name for " << device << ": "
361 << ToString(res);
362 return false;
363 }
364 chars[kCharLimit - 1] = '\0';
365 *device_name = chars.begin();
366 return true;
367 }
368
DeviceOptionsToContextFlags(const DeviceOptions & device_options,int * flags)369 bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
370 int* flags) {
371 static_assert(DeviceOptions::kMask == 0xf,
372 "needs update for new device options");
373
374 if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) {
375 *flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
376 }
377
378 // If no flags are set the default is CU_CTX_SCHED_AUTO, which
379 // in Google environments is very likely to mean SPIN.
380 if (device_options.flags() & DeviceOptions::kScheduleSpin) {
381 *flags |= CU_CTX_SCHED_SPIN;
382 }
383 if (device_options.flags() & DeviceOptions::kScheduleYield) {
384 *flags |= CU_CTX_SCHED_YIELD;
385 }
386 if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) {
387 *flags |= CU_CTX_SCHED_BLOCKING_SYNC;
388 }
389
390 return true;
391 }
392
// Retains the primary context of `device`, registers it with CreatedContexts
// and stores the resulting wrapper in `*context`.
//
// `device_options` is translated into primary-context flags, which are
// applied only if the primary context is not already active. The thread's
// current context is the same after the call as before it. `device_ordinal`
// is not used by this function.
//
// Returns OK on success; otherwise `*context` stays null and an INTERNAL
// status describing the cuDevicePrimaryCtxRetain failure is returned.
/* static */ port::Status GpuDriver::CreateContext(
    int device_ordinal, CUdevice device, const DeviceOptions& device_options,
    GpuContext** context) {
  *context = nullptr;

  int flags = 0;
  if (!DeviceOptionsToContextFlags(device_options, &flags)) {
    LOG(WARNING) << "could not convert all device options into context flags";
  }

  CUresult res;
  CUcontext former_context;
  CUcontext new_context;

  // Flags of an already-active primary context are not changed here; a
  // mismatch is only reported. An inactive one gets the desired flags.
  unsigned int former_primary_context_flags;
  int former_primary_context_is_active;
  CHECK_EQ(CUDA_SUCCESS,
           cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
                                      &former_primary_context_is_active));
  if (former_primary_context_flags != flags) {
    if (former_primary_context_is_active) {
      LOG(ERROR)
          << "The primary context is active and has a different flag set ("
          << former_primary_context_flags << ") than the desired flag set ("
          << flags << ").";
    } else {
      CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
    }
  }

  // Remember what was current so it can be reinstated below, then retain the
  // primary context. `res` is examined only after the diagnostics.
  former_context = cuda::CurrentContextOrDie();
  res = cuDevicePrimaryCtxRetain(&new_context, device);
  if (former_context != nullptr) {
    // Diagnostics only: report a context that already existed on this device
    // before StreamExecutor was initialized.
    CUdevice former_device;
    if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
      if (former_device == device) {
        if (former_context == new_context) {
          VLOG(2) << "The primary context " << former_context << " for device "
                  << device
                  << " exists before initializing the StreamExecutor.";
        } else {
          LOG(WARNING) << "A non-primary context " << former_context
                       << " for device " << device
                       << " exists before initializing the StreamExecutor. The "
                       << "primary context is now " << new_context << ". We "
                       << "haven't verified StreamExecutor works with that.";
        }
      }
    } else {
      LOG(ERROR) << "Failed to get the device of the current context "
                 << former_context;
    }
  }
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));

  if (res == CUDA_SUCCESS) {
    *context = CreatedContexts::Add(new_context);
    CHECK(*context != nullptr)
        << "success in this call must entail non-null result";
    VLOG(2) << "created or reused context " << new_context
            << " for this thread";
    return port::Status::OK();
  }

  // Failure path: build a descriptive message, adding the device's total
  // memory when the driver reported an out-of-memory condition.
  string message = "failed call to cuDevicePrimaryCtxRetain: " + ToString(res);
  if (res == CUDA_ERROR_OUT_OF_MEMORY) {
    uint64 total_memory;
    if (GetDeviceTotalMemory(device, &total_memory)) {
      absl::StrAppend(&message, "; total memory reported: ", total_memory);
    } else {
      absl::StrAppend(&message, "; could not query total memory");
    }
  }

  return port::Status(port::error::INTERNAL, message);
}
469
DestroyContext(GpuContext * context)470 /* static */ void GpuDriver::DestroyContext(GpuContext* context) {
471 if (context == nullptr) {
472 return;
473 }
474 CUcontext former_context = CurrentContext();
475 CUresult res = cuCtxSetCurrent(context->context());
476 CUdevice device;
477 cuCtxGetDevice(&device);
478 cuCtxSetCurrent(former_context);
479
480 res = cuDevicePrimaryCtxRelease(device);
481
482 if (res != CUDA_SUCCESS) {
483 LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
484 }
485
486 CreatedContexts::Remove(context->context());
487 }
488
FuncGetAttribute(CUfunction_attribute attribute,CUfunction func,int * attribute_value)489 /* static */ bool GpuDriver::FuncGetAttribute(CUfunction_attribute attribute,
490 CUfunction func,
491 int* attribute_value) {
492 CUresult res = cuFuncGetAttribute(attribute_value, attribute, func);
493 if (res != CUDA_SUCCESS) {
494 LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
495 << ", attribute: " << attribute;
496 return false;
497 }
498 return true;
499 }
500
FuncSetCacheConfig(CUfunction function,CUfunc_cache cache_config)501 /* static */ bool GpuDriver::FuncSetCacheConfig(CUfunction function,
502 CUfunc_cache cache_config) {
503 CUresult res = cuFuncSetCacheConfig(function, cache_config);
504 if (res != CUDA_SUCCESS) {
505 LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function
506 << ", config: " << cache_config << ", result: " << ToString(res);
507 return false;
508 }
509
510 return true;
511 }
512
513 /* static */ port::StatusOr<CUsharedconfig>
ContextGetSharedMemConfig(GpuContext * context)514 GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
515 CUsharedconfig shared_mem_config;
516 ScopedActivateContext activation(context);
517 CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config);
518 if (result != CUDA_SUCCESS) {
519 CUdevice device;
520 cuCtxGetDevice(&device);
521 LOG(ERROR) << "failed to get CUDA device shared memory config. "
522 << "Context device ID: " << device
523 << ", result: " << ToString(result);
524 return port::Status(
525 port::error::INTERNAL,
526 absl::StrCat("failed to get shared memory config: ", ToString(result)));
527 }
528 return shared_mem_config;
529 }
530
ContextSetSharedMemConfig(GpuContext * context,CUsharedconfig shared_mem_config)531 /* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
532 GpuContext* context, CUsharedconfig shared_mem_config) {
533 ScopedActivateContext activation(context);
534 CUresult result = cuCtxSetSharedMemConfig(shared_mem_config);
535 if (result != CUDA_SUCCESS) {
536 CUdevice device;
537 cuCtxGetDevice(&device);
538 LOG(ERROR) << "failed to set CUDA device shared memory config. "
539 << "Context device ID: " << device
540 << ", config: " << shared_mem_config
541 << ", result: " << ToString(result);
542 return port::Status(
543 port::error::INTERNAL,
544 absl::StrCat("failed to set shared memory config: ", ToString(result)));
545 }
546 return port::Status::OK();
547 }
548
LaunchKernel(GpuContext * context,CUfunction function,unsigned int grid_dim_x,unsigned int grid_dim_y,unsigned int grid_dim_z,unsigned int block_dim_x,unsigned int block_dim_y,unsigned int block_dim_z,unsigned int shared_mem_bytes,CUstream stream,void ** kernel_params,void ** extra)549 /* static */ bool GpuDriver::LaunchKernel(
550 GpuContext* context, CUfunction function, unsigned int grid_dim_x,
551 unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
552 unsigned int block_dim_y, unsigned int block_dim_z,
553 unsigned int shared_mem_bytes, CUstream stream, void** kernel_params,
554 void** extra) {
555 ScopedActivateContext activation(context);
556 VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
557 << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
558 << " bdx: " << block_dim_x << " bdy: " << block_dim_y
559 << " bdz: " << block_dim_z;
560 CUresult res = cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z,
561 block_dim_x, block_dim_y, block_dim_z,
562 shared_mem_bytes, stream, kernel_params, extra);
563 if (res != CUDA_SUCCESS) {
564 LOG(ERROR) << "failed to launch CUDA kernel: " << function
565 << "; result: " << ToString(res);
566 return false;
567 }
568 VLOG(2) << "successfully launched kernel";
569 return true;
570 }
571
LoadCubin(GpuContext * context,const char * cubin_bytes,CUmodule * module)572 /* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
573 const char* cubin_bytes,
574 CUmodule* module) {
575 ScopedActivateContext activation(context);
576 CUresult result = cuModuleLoadFatBinary(module, cubin_bytes);
577 if (result != CUDA_SUCCESS) {
578 return port::Status(port::error::INTERNAL,
579 "failed to load in-memory CUBIN: " + ToString(result));
580 }
581
582 return port::Status::OK();
583 }
584
LoadPtx(GpuContext * context,const char * ptx_contents,CUmodule * module)585 /* static */ bool GpuDriver::LoadPtx(GpuContext* context,
586 const char* ptx_contents,
587 CUmodule* module) {
588 port::Notification notification;
589 bool ret = true;
590 GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
591 ¬ification]() {
592 ScopedActivateContext activation(context);
593 void* ptx_data = const_cast<char*>(ptx_contents);
594 static const unsigned int kLogBufferBytesLimit = 1024;
595 unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
596 unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
597 absl::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes);
598 absl::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes);
599 bool log_verbose = true;
600 CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
601 CU_JIT_ERROR_LOG_BUFFER,
602 CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
603 CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE};
604 // Note that the driver API wants the contents of this values to be stored
605 // in an array of void*s, so we coerce them accordingly.
606 void* option_values[] = {
607 absl::bit_cast<void*>(uintptr_t(error_log_buffer_bytes)),
608 absl::bit_cast<void*>(error_log_buffer.data()),
609 absl::bit_cast<void*>(uintptr_t(info_log_buffer_bytes)),
610 absl::bit_cast<void*>(info_log_buffer.data()),
611 absl::bit_cast<void*>(uintptr_t(log_verbose))};
612 CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
613
614 CUresult res;
615 {
616 // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
617 // module loading: see http://b/13248943
618 absl::LeakCheckDisabler disabler;
619 res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options), options,
620 option_values);
621 }
622
623 // The PTX JIT mutates the values in the option values array to reflect the
624 // size of the logs it output; now that we've made the call, read the values
625 // back out.
626 error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
627 info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
628 CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
629 CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);
630
631 if (res != CUDA_SUCCESS) {
632 LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
633 // As a precaution for null termination of the API-provided value, ensure
634 // that at least the last byte is null.
635 error_log_buffer[error_log_buffer_bytes ? error_log_buffer_bytes - 1
636 : 0] = '\0';
637 LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
638 << " bytes): " << error_log_buffer.data();
639 ret = false;
640 notification.Notify();
641 }
642
643 VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
644 << " bytes): " << info_log_buffer.data();
645 VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
646 << " bytes): " << error_log_buffer.data();
647 CHECK(module != nullptr);
648 notification.Notify();
649 });
650 notification.WaitForNotification();
651
652 return ret;
653 }
654
LoadHsaco(GpuContext * context,const char * hsaco_contents,CUmodule * module)655 /* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
656 const char* hsaco_contents,
657 CUmodule* module) {
658 LOG(ERROR) << "Feature not supported on CUDA platform (LoadHsaco)";
659 return false;
660 }
661
SynchronousMemsetUint8(GpuContext * context,CUdeviceptr location,uint8 value,size_t size)662 /* static */ bool GpuDriver::SynchronousMemsetUint8(GpuContext* context,
663 CUdeviceptr location,
664 uint8 value, size_t size) {
665 ScopedActivateContext activation(context);
666 CUresult res = cuMemsetD8(location, value, size);
667 if (res != CUDA_SUCCESS) {
668 LOG(ERROR) << "failed to memset memory: " << ToString(res);
669 return false;
670 }
671 return true;
672 }
673
SynchronousMemsetUint32(GpuContext * context,CUdeviceptr location,uint32 value,size_t uint32_count)674 /* static */ bool GpuDriver::SynchronousMemsetUint32(GpuContext* context,
675 CUdeviceptr location,
676 uint32 value,
677 size_t uint32_count) {
678 ScopedActivateContext activation(context);
679 CUresult res = cuMemsetD32(location, value, uint32_count);
680 if (res != CUDA_SUCCESS) {
681 LOG(ERROR) << "failed to memset memory: " << ToString(res);
682 return false;
683 }
684 return true;
685 }
686
AsynchronousMemsetUint8(GpuContext * context,CUdeviceptr location,uint8 value,size_t uint32_count,CUstream stream)687 /* static */ bool GpuDriver::AsynchronousMemsetUint8(GpuContext* context,
688 CUdeviceptr location,
689 uint8 value,
690 size_t uint32_count,
691 CUstream stream) {
692 ScopedActivateContext activation(context);
693 CUresult res = cuMemsetD8Async(location, value, uint32_count, stream);
694 if (res != CUDA_SUCCESS) {
695 LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
696 return false;
697 }
698 VLOG(2) << "successfully enqueued async memset operation";
699 return true;
700 }
701
AsynchronousMemsetUint32(GpuContext * context,CUdeviceptr location,uint32 value,size_t uint32_count,CUstream stream)702 /* static */ bool GpuDriver::AsynchronousMemsetUint32(GpuContext* context,
703 CUdeviceptr location,
704 uint32 value,
705 size_t uint32_count,
706 CUstream stream) {
707 ScopedActivateContext activation(context);
708 CUresult res = cuMemsetD32Async(location, value, uint32_count, stream);
709 if (res != CUDA_SUCCESS) {
710 LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
711 return false;
712 }
713 VLOG(2) << "successfully enqueued async memset operation";
714 return true;
715 }
716
AddStreamCallback(GpuContext * context,CUstream stream,StreamCallback callback,void * data)717 /* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
718 CUstream stream,
719 StreamCallback callback,
720 void* data) {
721 // Note: flags param is required to be zero according to CUDA 6.0.
722 CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
723 if (res != CUDA_SUCCESS) {
724 LOG(ERROR) << "unable to add host callback: " << ToString(res);
725 return false;
726 }
727 return true;
728 }
729
GetModuleFunction(GpuContext * context,CUmodule module,const char * kernel_name,CUfunction * function)730 /* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
731 CUmodule module,
732 const char* kernel_name,
733 CUfunction* function) {
734 ScopedActivateContext activated{context};
735 CHECK(module != nullptr && kernel_name != nullptr);
736 CUresult res = cuModuleGetFunction(function, module, kernel_name);
737 if (res != CUDA_SUCCESS) {
738 LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
739 << "\" from module: " << ToString(res);
740 return false;
741 }
742
743 return true;
744 }
745
GetModuleSymbol(GpuContext * context,CUmodule module,const char * symbol_name,CUdeviceptr * dptr,size_t * bytes)746 /* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
747 CUmodule module,
748 const char* symbol_name,
749 CUdeviceptr* dptr, size_t* bytes) {
750 ScopedActivateContext activated{context};
751 CHECK(module != nullptr && symbol_name != nullptr &&
752 (dptr != nullptr || bytes != nullptr));
753 CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
754 if (res != CUDA_SUCCESS) {
755 // symbol may not be found in the current module, but it may reside in
756 // another module.
757 VLOG(2) << "failed to get symbol \"" << symbol_name
758 << "\" from module: " << ToString(res);
759 return false;
760 }
761
762 return true;
763 }
764
UnloadModule(GpuContext * context,CUmodule module)765 /* static */ void GpuDriver::UnloadModule(GpuContext* context,
766 CUmodule module) {
767 ScopedActivateContext activated{context};
768 CUresult res = cuModuleUnload(module);
769 if (res != CUDA_SUCCESS) {
770 LOG(ERROR) << "failed to unload module " << module
771 << "; leaking: " << ToString(res);
772 }
773 }
774
DeviceFromContext(GpuContext * context)775 /* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
776 GpuContext* context) {
777 ScopedActivateContext activated{context};
778 CUdevice device = -1;
779 CUresult result = cuCtxGetDevice(&device);
780 if (result == CUDA_SUCCESS) {
781 return device;
782 }
783
784 return port::Status(
785 port::error::INTERNAL,
786 absl::StrCat("failed to get device for context: ", ToString(result)));
787 }
788
CreateStream(GpuContext * context,CUstream * stream)789 /* static */ bool GpuDriver::CreateStream(GpuContext* context,
790 CUstream* stream) {
791 // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
792 // up synchronization with respect to memsets and any other things that have
793 // to occur on the default stream?
794 ScopedActivateContext activated{context};
795 CUresult res = cuStreamCreate(stream, 0);
796 if (res != CUDA_SUCCESS) {
797 LOG(ERROR) << "could not allocate CUDA stream for context "
798 << context->context() << ": " << ToString(res);
799 return false;
800 }
801
802 VLOG(2) << "successfully created stream " << *stream << " for context "
803 << context->context() << " on thread";
804 return true;
805 }
806
DestroyStream(GpuContext * context,CUstream * stream)807 /* static */ void GpuDriver::DestroyStream(GpuContext* context,
808 CUstream* stream) {
809 if (*stream == nullptr) {
810 return;
811 }
812
813 ScopedActivateContext activated{context};
814 CUresult res = cuStreamDestroy(*stream);
815 if (res != CUDA_SUCCESS) {
816 LOG(ERROR) << "failed to destroy CUDA stream for context "
817 << context->context() << ": " << ToString(res);
818 } else {
819 VLOG(2) << "successfully destroyed stream " << *stream << " for context "
820 << context->context();
821 *stream = nullptr;
822 }
823 }
824
DeviceAllocate(GpuContext * context,uint64 bytes)825 /* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
826 uint64 bytes) {
827 if (bytes == 0) {
828 return nullptr;
829 }
830
831 ScopedActivateContext activated{context};
832 CUdeviceptr result = 0;
833 CUresult res = cuMemAlloc(&result, bytes);
834 if (res != CUDA_SUCCESS) {
835 LOG(ERROR) << "failed to allocate "
836 << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
837 << " bytes) from device: " << ToString(res);
838 return nullptr;
839 }
840 void* ptr = reinterpret_cast<void*>(result);
841 VLOG(2) << "allocated " << ptr << " for context " << context->context()
842 << " of " << bytes << " bytes";
843 return ptr;
844 }
845
DeviceDeallocate(GpuContext * context,void * location)846 /* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
847 void* location) {
848 ScopedActivateContext activation(context);
849 CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
850 CUresult res = cuMemFree(pointer);
851 if (res != CUDA_SUCCESS) {
852 LOG(ERROR) << "failed to free device memory at " << location
853 << "; result: " << ToString(res);
854 } else {
855 VLOG(2) << "deallocated " << location << " for context "
856 << context->context();
857 }
858 }
859
UnifiedMemoryAllocate(GpuContext * context,uint64 bytes)860 /* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
861 uint64 bytes) {
862 ScopedActivateContext activation(context);
863 CUdeviceptr result = 0;
864 // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
865 CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
866 if (res != CUDA_SUCCESS) {
867 LOG(ERROR) << "failed to alloc " << bytes
868 << " bytes unified memory; result: " << ToString(res);
869 return nullptr;
870 }
871 void* ptr = reinterpret_cast<void*>(result);
872 VLOG(2) << "allocated " << ptr << " for context " << context->context()
873 << " of " << bytes << " bytes in unified memory";
874 return ptr;
875 }
876
UnifiedMemoryDeallocate(GpuContext * context,void * location)877 /* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
878 void* location) {
879 ScopedActivateContext activation(context);
880 CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
881 CUresult res = cuMemFree(pointer);
882 if (res != CUDA_SUCCESS) {
883 LOG(ERROR) << "failed to free unified memory at " << location
884 << "; result: " << ToString(res);
885 } else {
886 VLOG(2) << "deallocated unified memory at " << location << " for context "
887 << context->context();
888 }
889 }
890
HostAllocate(GpuContext * context,uint64 bytes)891 /* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
892 ScopedActivateContext activation(context);
893 void* host_mem = nullptr;
894 // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
895 CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
896 if (res != CUDA_SUCCESS) {
897 LOG(ERROR) << "failed to alloc " << bytes
898 << " bytes on host: " << ToString(res);
899 }
900 return host_mem;
901 }
902
HostDeallocate(GpuContext * context,void * location)903 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
904 void* location) {
905 ScopedActivateContext activation(context);
906 CUresult res = cuMemFreeHost(location);
907 if (res != CUDA_SUCCESS) {
908 LOG(ERROR) << "error deallocating host memory at " << location << ": "
909 << ToString(res);
910 }
911 }
912
HostRegister(GpuContext * context,void * location,uint64 bytes)913 /* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
914 uint64 bytes) {
915 ScopedActivateContext activation(context);
916 // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
917 CUresult res =
918 cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
919 if (res != CUDA_SUCCESS) {
920 LOG(ERROR) << "error registering host memory at " << location << ": "
921 << ToString(res);
922 return false;
923 }
924 return true;
925 }
926
HostUnregister(GpuContext * context,void * location)927 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
928 void* location) {
929 ScopedActivateContext activation(context);
930 CUresult res = cuMemHostUnregister(location);
931 if (res != CUDA_SUCCESS) {
932 LOG(ERROR) << "error unregistering host memory at " << location << ": "
933 << ToString(res);
934 return false;
935 }
936 return true;
937 }
938
DestroyEvent(GpuContext * context,CUevent * event)939 /* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
940 CUevent* event) {
941 if (*event == nullptr) {
942 return port::Status(port::error::INVALID_ARGUMENT,
943 "input event cannot be null");
944 }
945
946 ScopedActivateContext activated{context};
947 CUresult res = cuEventDestroy(*event);
948 *event = nullptr;
949
950 switch (res) {
951 case CUDA_SUCCESS:
952 return port::Status::OK();
953 case CUDA_ERROR_DEINITIALIZED:
954 case CUDA_ERROR_NOT_INITIALIZED:
955 return port::Status(
956 port::error::FAILED_PRECONDITION,
957 port::Printf("error destroying CUDA event in context %p: %s", context,
958 ToString(res).c_str()));
959 default:
960 return port::Status(
961 port::error::INTERNAL,
962 port::Printf("error destroying CUDA event in context %p: %s", context,
963 ToString(res).c_str()));
964 }
965 }
966
RecordEvent(GpuContext * context,CUevent event,CUstream stream)967 /* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
968 CUevent event,
969 CUstream stream) {
970 ScopedActivateContext activated{context};
971 CUresult res = cuEventRecord(event, stream);
972 switch (res) {
973 case CUDA_SUCCESS:
974 return port::Status::OK();
975 case CUDA_ERROR_DEINITIALIZED:
976 case CUDA_ERROR_NOT_INITIALIZED:
977 return port::Status(
978 port::error::FAILED_PRECONDITION,
979 port::Printf("error recording CUDA event on stream %p: %s", stream,
980 ToString(res).c_str()));
981 default:
982 return port::Status(
983 port::error::INVALID_ARGUMENT,
984 port::Printf("error recording CUDA event on stream %p: %s", stream,
985 ToString(res).c_str()));
986 }
987 }
988
QueryEvent(GpuContext * context,CUevent event)989 /* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
990 CUevent event) {
991 ScopedActivateContext activated{context};
992 CUresult res = cuEventQuery(event);
993 if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
994 return port::Status(
995 port::error::INTERNAL,
996 port::Printf("failed to query event: %s", ToString(res).c_str()));
997 }
998
999 return res;
1000 }
1001
GetEventElapsedTime(GpuContext * context,float * elapsed_milliseconds,CUevent start,CUevent stop)1002 /* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
1003 float* elapsed_milliseconds,
1004 CUevent start, CUevent stop) {
1005 ScopedActivateContext activated{context};
1006 // The stop event must have completed in order for cuEventElapsedTime to
1007 // work.
1008 CUresult res = cuEventSynchronize(stop);
1009 if (res != CUDA_SUCCESS) {
1010 LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
1011 return false;
1012 }
1013 res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
1014 if (res != CUDA_SUCCESS) {
1015 LOG(ERROR) << "failed to get elapsed time between events: "
1016 << ToString(res);
1017 return false;
1018 }
1019
1020 return true;
1021 }
1022
WaitStreamOnEvent(GpuContext * context,CUstream stream,CUevent event)1023 /* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
1024 CUstream stream, CUevent event) {
1025 ScopedActivateContext activation(context);
1026 CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
1027 if (res != CUDA_SUCCESS) {
1028 LOG(ERROR) << "could not wait stream on event: " << ToString(res);
1029 return false;
1030 }
1031
1032 return true;
1033 }
1034
SynchronizeContext(GpuContext * context)1035 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
1036 ScopedActivateContext activation(context);
1037 CUresult res = cuCtxSynchronize();
1038 if (res != CUDA_SUCCESS) {
1039 LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
1040 << " :: " << port::CurrentStackTrace();
1041 return false;
1042 }
1043
1044 return true;
1045 }
1046
SynchronizeStream(GpuContext * context,CUstream stream)1047 /* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
1048 CUstream stream) {
1049 ScopedActivateContext activated{context};
1050 CHECK(stream != nullptr);
1051 CUresult res = cuStreamSynchronize(stream);
1052 if (res != CUDA_SUCCESS) {
1053 port::Status status = port::InternalError(
1054 absl::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
1055 LOG(ERROR) << status << " :: " << port::CurrentStackTrace();
1056 return status;
1057 }
1058 VLOG(2) << "successfully synchronized stream " << stream << " on context "
1059 << context;
1060 return port::Status::OK();
1061 }
1062
IsStreamIdle(GpuContext * context,CUstream stream)1063 /* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
1064 CUstream stream) {
1065 ScopedActivateContext activated{context};
1066 CHECK(stream != nullptr);
1067 CUresult res = cuStreamQuery(stream);
1068 if (res == CUDA_SUCCESS) {
1069 return true;
1070 }
1071
1072 if (res != CUDA_ERROR_NOT_READY) {
1073 LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
1074 }
1075 return false;
1076 }
1077
SynchronousMemcpyD2H(GpuContext * context,void * host_dst,CUdeviceptr gpu_src,uint64 size)1078 /* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
1079 void* host_dst,
1080 CUdeviceptr gpu_src,
1081 uint64 size) {
1082 ScopedActivateContext activation(context);
1083 if (size > 0) {
1084 CHECK(PointerIsValid(gpu_src))
1085 << "Source pointer is not actually on GPU: " << gpu_src;
1086 CHECK(PointerIsValid(host_dst))
1087 << "Destination pointer is not actually on CPU: " << host_dst;
1088 }
1089 CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size);
1090 if (res != CUDA_SUCCESS) {
1091 return port::InternalError(
1092 port::Printf("failed to synchronous memcpy from device to host: %s; "
1093 "host dst: %p; GPU src: %p; size: %llu=0x%llx",
1094 ToString(res).c_str(), host_dst,
1095 absl::bit_cast<void*>(gpu_src), size, size));
1096 }
1097 VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
1098 << host_dst;
1099 return port::Status::OK();
1100 }
1101
SynchronousMemcpyH2D(GpuContext * context,CUdeviceptr gpu_dst,const void * host_src,uint64 size)1102 /* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
1103 CUdeviceptr gpu_dst,
1104 const void* host_src,
1105 uint64 size) {
1106 ScopedActivateContext activation(context);
1107 if (size > 0) {
1108 CHECK(PointerIsValid(host_src))
1109 << "Source pointer is not actually on CPU: " << host_src;
1110 CHECK(PointerIsValid(gpu_dst))
1111 << "Destination pointer is not actually on GPU: " << gpu_dst;
1112 }
1113 CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size);
1114 if (res != CUDA_SUCCESS) {
1115 return port::InternalError(port::Printf(
1116 "failed to synchronous memcpy from host to device: %s; GPU dst: %p;"
1117 " host src: %p; size: %llu=0x%llx",
1118 ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
1119 size));
1120 }
1121 VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
1122 return port::Status::OK();
1123 }
1124
SynchronousMemcpyD2D(GpuContext * context,CUdeviceptr gpu_dst,CUdeviceptr gpu_src,uint64 size)1125 /* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
1126 CUdeviceptr gpu_dst,
1127 CUdeviceptr gpu_src,
1128 uint64 size) {
1129 ScopedActivateContext activation(context);
1130 if (size > 0) {
1131 CHECK(PointerIsValid(gpu_src))
1132 << "Source pointer is not actually on GPU: " << gpu_src;
1133 CHECK(PointerIsValid(gpu_dst))
1134 << "Destination pointer is not actually on GPU: " << gpu_dst;
1135 }
1136 CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size);
1137 if (res != CUDA_SUCCESS) {
1138 return port::InternalError(port::Printf(
1139 "failed to synchronous memcpy from host to device: %s; GPU dst: %p; "
1140 "GPU src: %p; size: %llu=0x%llx",
1141 ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst),
1142 absl::bit_cast<void*>(gpu_src), size, size));
1143 }
1144 VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
1145 return port::Status::OK();
1146 }
1147
AsynchronousMemcpyD2H(GpuContext * context,void * host_dst,CUdeviceptr gpu_src,uint64 size,CUstream stream)1148 /* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
1149 void* host_dst,
1150 CUdeviceptr gpu_src,
1151 uint64 size,
1152 CUstream stream) {
1153 ScopedActivateContext activation(context);
1154 if (size > 0) {
1155 CHECK(PointerIsValid(gpu_src))
1156 << "Source pointer is not actually on GPU: " << gpu_src;
1157 CHECK(PointerIsValid(host_dst))
1158 << "Destination pointer is not actually on CPU: " << host_dst;
1159 }
1160 CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
1161 if (res != CUDA_SUCCESS) {
1162 LOG(ERROR) << port::Printf(
1163 "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
1164 "GPU src: %p; size: %llu=0x%llx",
1165 ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
1166 size);
1167 return false;
1168 }
1169 VLOG(2) << "successfully enqueued async memcpy d2h of " << size
1170 << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
1171 << host_dst << " on stream " << stream;
1172 return true;
1173 }
1174
AsynchronousMemcpyH2D(GpuContext * context,CUdeviceptr gpu_dst,const void * host_src,uint64 size,CUstream stream)1175 /* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
1176 CUdeviceptr gpu_dst,
1177 const void* host_src,
1178 uint64 size,
1179 CUstream stream) {
1180 ScopedActivateContext activation(context);
1181 if (size > 0) {
1182 CHECK(PointerIsValid(host_src))
1183 << "Source pointer is not actually on CPU: " << host_src;
1184 CHECK(PointerIsValid(gpu_dst))
1185 << "Destination pointer is not actually on GPU: " << gpu_dst;
1186 }
1187 CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
1188 if (res != CUDA_SUCCESS) {
1189 LOG(ERROR) << port::Printf(
1190 "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
1191 "host src: %p; size: %llu=0x%llx",
1192 ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
1193 size);
1194 return false;
1195 }
1196 VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
1197 << " on stream " << stream;
1198 return true;
1199 }
1200
AsynchronousMemcpyD2D(GpuContext * context,CUdeviceptr gpu_dst,CUdeviceptr gpu_src,uint64 size,CUstream stream)1201 /* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
1202 CUdeviceptr gpu_dst,
1203 CUdeviceptr gpu_src,
1204 uint64 size,
1205 CUstream stream) {
1206 ScopedActivateContext activation(context);
1207 if (size > 0) {
1208 CHECK(PointerIsValid(gpu_src))
1209 << "Source pointer is not actually on GPU: " << gpu_src;
1210 CHECK(PointerIsValid(gpu_dst))
1211 << "Destination pointer is not actually on GPU: " << gpu_dst;
1212 }
1213 CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
1214 if (result != CUDA_SUCCESS) {
1215 LOG(ERROR) << port::Printf(
1216 "failed to enqueue async memcpy from device to device: %s"
1217 "; GPU dst: %p on %s %s"
1218 "; GPU src: %p on %s %s"
1219 "; can access? %s; size: %llu=0x%llx",
1220 ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
1221 CUDAPointerToMemorySpaceString(gpu_dst).c_str(),
1222 CUDAPointerToDeviceString(gpu_dst).c_str(),
1223 absl::bit_cast<void*>(gpu_src),
1224 CUDAPointerToMemorySpaceString(gpu_src).c_str(),
1225 CUDAPointerToDeviceString(gpu_src).c_str(),
1226 CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
1227
1228 return false;
1229 }
1230 VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
1231 return true;
1232 }
1233
CreateEvent(GpuContext * context,CUevent * result,EventFlags flags)1234 /* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
1235 CUevent* result,
1236 EventFlags flags) {
1237 int cuflags;
1238 switch (flags) {
1239 case EventFlags::kDefault:
1240 cuflags = CU_EVENT_DEFAULT;
1241 break;
1242 case EventFlags::kDisableTiming:
1243 cuflags = CU_EVENT_DISABLE_TIMING;
1244 break;
1245 default:
1246 LOG(FATAL) << "impossible event flags: " << int(flags);
1247 }
1248
1249 ScopedActivateContext activated{context};
1250 CUresult res = cuEventCreate(result, cuflags);
1251
1252 if (res == CUDA_SUCCESS) {
1253 return port::Status::OK();
1254 } else if (res == CUDA_ERROR_OUT_OF_MEMORY) {
1255 return port::Status(port::error::RESOURCE_EXHAUSTED,
1256 "could not create CUDA event: out of device memory");
1257 } else {
1258 return port::Status(
1259 port::error::FAILED_PRECONDITION,
1260 absl::StrCat("could not create CUDA event: ", ToString(res)));
1261 }
1262 }
1263
GetDeviceCount()1264 /* static */ int GpuDriver::GetDeviceCount() {
1265 int device_count = 0;
1266 CUresult res = cuDeviceGetCount(&device_count);
1267 if (res != CUDA_SUCCESS) {
1268 LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
1269 return 0;
1270 }
1271
1272 if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) {
1273 device_count = 1;
1274 }
1275 return device_count;
1276 }
1277
GetPointerContext(CUdeviceptr pointer)1278 /* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
1279 CUdeviceptr pointer) {
1280 GpuContext* context = nullptr;
1281 CUresult result =
1282 cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
1283 if (result == CUDA_SUCCESS) {
1284 CHECK(context != nullptr) << "success should entail non-null context";
1285 return context;
1286 }
1287
1288 return port::Status(
1289 port::error::INTERNAL,
1290 absl::StrCat("failed to query device pointer for context: ",
1291 ToString(result)));
1292 }
1293
GetPointerMemorySpace(CUdeviceptr pointer)1294 /* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
1295 CUdeviceptr pointer) {
1296 unsigned int value;
1297 CUresult result =
1298 cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
1299 if (result == CUDA_SUCCESS) {
1300 switch (value) {
1301 case CU_MEMORYTYPE_DEVICE:
1302 return MemorySpace::kDevice;
1303 case CU_MEMORYTYPE_HOST:
1304 return MemorySpace::kHost;
1305 default:
1306 return port::Status(
1307 port::error::INTERNAL,
1308 absl::StrCat("unknown memory space provided by CUDA API: ", value));
1309 }
1310 }
1311
1312 return port::Status(
1313 port::error::INTERNAL,
1314 absl::StrCat("failed to query device pointer for memory space: ",
1315 ToString(result)));
1316 }
1317
GetPointerAddressRange(CUdeviceptr dptr,CUdeviceptr * base,size_t * size)1318 /* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
1319 CUdeviceptr* base,
1320 size_t* size) {
1321 CUresult result = cuMemGetAddressRange(base, size, dptr);
1322 if (result == CUDA_SUCCESS) {
1323 return port::Status::OK();
1324 } else if (result == CUDA_ERROR_NOT_FOUND) {
1325 // We differentiate between "this pointer is unknown" (return here) and
1326 // "there was an internal error while performing this operation" (return
1327 // below).
1328 return port::Status(
1329 port::error::NOT_FOUND,
1330 port::Printf("not a device pointer %p; %s",
1331 reinterpret_cast<void*>(dptr), ToString(result).c_str()));
1332 }
1333
1334 return port::Status(
1335 port::error::INTERNAL,
1336 port::Printf("failed to get pointer into for device pointer %p; %s",
1337 reinterpret_cast<void*>(dptr), ToString(result).c_str()));
1338 }
1339
GetPointerDevice(CUdeviceptr pointer)1340 /* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
1341 CUdeviceptr pointer) {
1342 auto result = GetPointerContext(pointer);
1343 if (!result.ok()) {
1344 return result.status();
1345 }
1346
1347 return DeviceFromContext(result.ValueOrDie());
1348 }
1349
GetComputeCapability(int * cc_major,int * cc_minor,CUdevice device)1350 /* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
1351 int* cc_minor,
1352 CUdevice device) {
1353 *cc_major = 0;
1354 *cc_minor = 0;
1355
1356 CUresult res = cuDeviceGetAttribute(
1357 cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
1358 if (res != CUDA_SUCCESS) {
1359 return port::Status(
1360 port::error::INTERNAL,
1361 port::Printf(
1362 "failed to get compute capability major for device: %s; %d",
1363 ToString(res).c_str(), device));
1364 }
1365
1366 res = cuDeviceGetAttribute(
1367 cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
1368 if (res != CUDA_SUCCESS) {
1369 return port::Status(
1370 port::error::INTERNAL,
1371 port::Printf(
1372 "failed to get compute capability minor for device: %s; %d",
1373 ToString(res).c_str(), device));
1374 }
1375
1376 return port::Status::OK();
1377 }
1378
GetGpuISAVersion(int * version,CUdevice device)1379 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
1380 CUdevice device) {
1381 return port::Status{
1382 port::error::INTERNAL,
1383 "Feature not supported on CUDA platform (GetGpuISAVersion)"};
1384 }
1385
1386 // Helper function that turns the integer output of cuDeviceGetAttribute to type
1387 // T and wraps it in a StatusOr.
1388 template <typename T>
GetSimpleAttribute(CUdevice device,CUdevice_attribute attribute)1389 static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
1390 CUdevice_attribute attribute) {
1391 int value = -1;
1392 CUresult result = cuDeviceGetAttribute(&value, attribute, device);
1393 if (result != CUDA_SUCCESS) {
1394 return port::Status(
1395 port::error::NOT_FOUND,
1396 absl::StrCat("could not retrieve CUDA device attribute (", attribute,
1397 "): ", ToString(result)));
1398 }
1399 T converted = value;
1400 return converted;
1401 }
1402
GetMultiprocessorCount(CUdevice device)1403 /* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
1404 CUdevice device) {
1405 return GetSimpleAttribute<int>(device,
1406 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
1407 }
1408
GetMaxSharedMemoryPerCore(CUdevice device)1409 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
1410 CUdevice device) {
1411 return GetSimpleAttribute<int64>(
1412 device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
1413 }
1414
GetMaxSharedMemoryPerBlock(CUdevice device)1415 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
1416 CUdevice device) {
1417 return GetSimpleAttribute<int64>(
1418 device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
1419 }
1420
GetMaxThreadsPerMultiprocessor(CUdevice device)1421 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
1422 CUdevice device) {
1423 return GetSimpleAttribute<int64>(
1424 device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
1425 }
1426
GetMaxThreadsPerBlock(CUdevice device)1427 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
1428 CUdevice device) {
1429 return GetSimpleAttribute<int64>(device,
1430 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
1431 }
1432
GetMaxRegistersPerBlock(CUdevice device)1433 /* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
1434 CUdevice device) {
1435 return GetSimpleAttribute<int64>(device,
1436 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
1437 }
1438
GetThreadsPerWarp(CUdevice device)1439 /* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
1440 CUdevice device) {
1441 return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
1442 }
1443
GetGridLimits(int * x,int * y,int * z,CUdevice device)1444 /* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
1445 CUdevice device) {
1446 int value;
1447 CUresult res =
1448 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
1449 if (res != CUDA_SUCCESS) {
1450 LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
1451 return false;
1452 }
1453 *x = value;
1454
1455 res =
1456 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
1457 if (res != CUDA_SUCCESS) {
1458 LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
1459 return false;
1460 }
1461 *y = value;
1462
1463 res =
1464 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
1465 if (res != CUDA_SUCCESS) {
1466 LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
1467 return false;
1468 }
1469 *z = value;
1470 return true;
1471 }
1472
GetDriverVersion(int * driver_version)1473 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
1474 CUresult res = cuDriverGetVersion(driver_version);
1475 if (res != CUDA_SUCCESS) {
1476 LOG(ERROR) << "failed to query driver version: " << ToString(res);
1477 return false;
1478 }
1479
1480 return true;
1481 }
1482
GetDeviceProperties(CUdevprop * device_properties,int device_ordinal)1483 /* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
1484 int device_ordinal) {
1485 CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
1486 if (res != CUDA_SUCCESS) {
1487 LOG(ERROR) << "failed to query device properties: " << ToString(res);
1488 return false;
1489 }
1490
1491 return true;
1492 }
1493
GetDeviceAttribute(CUdevice_attribute attribute,CUdevice device)1494 /* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
1495 CUdevice_attribute attribute, CUdevice device) {
1496 int val;
1497 CUresult res = cuDeviceGetAttribute(&val, attribute, device);
1498 if (res != CUDA_SUCCESS) {
1499 return port::Status(
1500 port::error::INTERNAL,
1501 port::Printf("failed to get device attribute %d for device %d: %s",
1502 attribute, device, ToString(res).c_str()));
1503 }
1504 return val;
1505 }
1506
IsEccEnabled(CUdevice device,bool * result)1507 /* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
1508 int value = -1;
1509 CUresult res =
1510 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
1511 if (res != CUDA_SUCCESS) {
1512 LOG(ERROR) << "failed to query ECC status: " << ToString(res);
1513 return false;
1514 }
1515
1516 *result = value;
1517 return true;
1518 }
1519
GetDeviceMemoryInfo(GpuContext * context,int64 * free_out,int64 * total_out)1520 /* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
1521 int64* free_out,
1522 int64* total_out) {
1523 ScopedActivateContext activation(context);
1524 size_t free = 0;
1525 size_t total = 0;
1526 CUresult res = cuMemGetInfo(&free, &total);
1527 if (res != CUDA_SUCCESS) {
1528 LOG(ERROR) << "failed to query device memory info: " << ToString(res);
1529 return false;
1530 }
1531
1532 *free_out = free;
1533 *total_out = total;
1534 return true;
1535 }
1536
GetDeviceTotalMemory(CUdevice device,uint64 * result)1537 /* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
1538 uint64* result) {
1539 size_t value = -1;
1540 CUresult res = cuDeviceTotalMem(&value, device);
1541 if (res != CUDA_SUCCESS) {
1542 LOG(ERROR) << "failed to query total available memory: " << ToString(res);
1543 return false;
1544 }
1545
1546 *result = value;
1547 return true;
1548 }
1549
GetPCIBusID(CUdevice device)1550 /* static */ string GpuDriver::GetPCIBusID(CUdevice device) {
1551 string pci_bus_id;
1552 static const int kBufferSize = 64;
1553 absl::InlinedVector<char, 4> chars(kBufferSize);
1554 chars[kBufferSize - 1] = '\0';
1555 CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
1556 if (res != CUDA_SUCCESS) {
1557 LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
1558 return pci_bus_id;
1559 }
1560 pci_bus_id = chars.begin();
1561 return pci_bus_id;
1562 }
1563
CanEnablePeerAccess(GpuContext * from,GpuContext * to)1564 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
1565 GpuContext* to) {
1566 if (from == to) {
1567 return true; // A context can always access its own memory.
1568 }
1569
1570 int can_access_peer = -1;
1571 auto from_device = DeviceFromContext(from);
1572 if (!from_device.ok()) {
1573 LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
1574 << from_device.status();
1575 return false;
1576 }
1577 auto to_device = DeviceFromContext(to);
1578 if (!to_device.ok()) {
1579 LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
1580 << to_device.status();
1581 return false;
1582 }
1583 CUresult res = cuDeviceCanAccessPeer(
1584 &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
1585 if (res != CUDA_SUCCESS) {
1586 LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
1587 return false;
1588 }
1589
1590 return can_access_peer;
1591 }
1592
EnablePeerAccess(GpuContext * from,GpuContext * to)1593 /* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
1594 GpuContext* to) {
1595 if (from == to) {
1596 return port::Status::OK(); // A context can always access its own memory.
1597 }
1598
1599 ScopedActivateContext activated{from};
1600 CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
1601 if (result != CUDA_SUCCESS &&
1602 result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
1603 return port::Status(
1604 port::error::INTERNAL,
1605 port::Printf("failed to enable peer access from %p to %p: %s", from, to,
1606 ToString(result).c_str()));
1607 }
1608
1609 return port::Status::OK();
1610 }
1611
GetMaxOccupiedBlocksPerCore(GpuContext * context,CUfunction kernel,int threads_per_block,size_t dynamic_shared_memory_bytes)1612 /* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
1613 GpuContext* context, CUfunction kernel, int threads_per_block,
1614 size_t dynamic_shared_memory_bytes) {
1615 ScopedActivateContext activation(context);
1616
1617 int max_blocks;
1618 CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
1619 &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
1620 if (result != CUDA_SUCCESS) {
1621 return port::Status(
1622 port::error::INTERNAL,
1623 port::Printf("failed to calculate occupancy of kernel %p: %s", kernel,
1624 ToString(result).c_str()));
1625 }
1626
1627 return max_blocks;
1628 }
1629
1630 } // namespace gpu
1631
1632 namespace cuda {
1633
CurrentContextOrDie()1634 CUcontext CurrentContextOrDie() {
1635 CUcontext current = nullptr;
1636 CUresult result = cuCtxGetCurrent(¤t);
1637 if (result != CUDA_SUCCESS) {
1638 LOG(FATAL) << "failed to query current context: " << gpu::ToString(result);
1639 }
1640 return current;
1641 }
1642
1643 } // namespace cuda
1644 } // namespace stream_executor
1645