/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"

#if defined(__APPLE__)
#include <mach-o/dyld.h>
#endif
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <unistd.h>
#endif
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_event.h"
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/cuda/cuda_timer.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/mathutil.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/path.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/ptr_util.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/lib/str_util.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/timer.h"

// LOG(ERROR) uses a const named ERROR, so a macro with the same name is
// always unwanted. This happens on Windows, which defines such a macro.
#undef ERROR

#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif

#ifdef __CUDA_RUNTIME_H__
#error \
    "CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif

extern bool FLAGS_check_gpu_leaks;
bool FLAGS_prefer_cubin_to_ptx = true;

namespace stream_executor {
namespace gpu {

// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
// It has been observed that loading both PTX and cubins into the driver
// library can cause it to crash, but loading only CUBINs avoids those crashes;
// therefore, it's useful to have this hook to hack in uniform CUBIN-ation of
// PTX code.
//
// As this is an implementation-detail workaround, the usage is to declare this
// variable with extern linkage and populate it from another translation unit.
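//
// A hypothetical use from another translation unit might look like the
// following (illustrative only; the PTX-to-CUBIN compile function is an
// assumed placeholder, not part of this API):
//
//   extern std::function<string(const string &)> g_cubinate;
//   ...
//   g_cubinate = [](const string &ptx) { return CompilePtxToCubin(ptx); };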
std::function<string(const string &)> g_cubinate;

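// Given a platform-independent event datatype, returns the internal CUDA
// platform implementation pointer.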
static GpuEvent* AsGpuEvent(Event* event) {
  DCHECK(event != nullptr);
  return static_cast<GpuEvent*>(event->implementation());
}

// Given a platform-independent timer datatype, returns the internal CUDA
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
  DCHECK(timer != nullptr);
  return static_cast<GpuTimer*>(timer->implementation());
}

// Given const GPU memory, returns a libcuda device pointer datatype, suitable
// for passing directly to libcuda APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// libcuda APIs, so the caller should take care to only pass the result of const
// GPU memory conversions to libcuda functions which will honor constness.
static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase &gpu_mem) {
  return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque());
}

// See description on const version above.
static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
  return AsCudaDevicePtr(*gpu_mem);
}

GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
  CHECK(cuda_exec != nullptr);
  return cuda_exec->gpu_context();
}

GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
  return static_cast<GpuExecutor*>(stream_exec->implementation());
}

GpuExecutor::~GpuExecutor() {
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
}

port::Status GpuExecutor::Init(int device_ordinal,
                               DeviceOptions device_options) {
  device_ordinal_ = device_ordinal;

  auto status = GpuDriver::Init();
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::GetDevice(device_ordinal_, &device_);
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
                                    &context_);
  if (!status.ok()) {
    return status;
  }

  return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
}

bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    string* found_filename) const {
  if (cc_major_ == 0 && cc_minor_ == 0) {
    return false;
  }

  string cc_specific =
      absl::StrCat(filename, ".cc", cc_major_, cc_minor_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found compute-capability-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find compute-capability specific file at: "
          << cc_specific;
  if (port::FileExists(string(filename)).ok()) {
    *found_filename = string(filename);
    return true;
  }

  return false;
}

bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          string* found_filename) const {
  LOG(ERROR)
      << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
  return false;
}

// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
//                 returned string. Example: calling this from /usr/bin/foo
//                 would return /usr/bin.
static string GetBinaryDir(bool strip_exe) {
  char exe_path[PATH_MAX] = {0};
#if defined(__APPLE__)
  uint32_t buffer_size = 0U;
  _NSGetExecutablePath(nullptr, &buffer_size);
  char unresolved_path[buffer_size];
  _NSGetExecutablePath(unresolved_path, &buffer_size);
  CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
#else
#if defined(PLATFORM_WINDOWS)
  HMODULE hModule = GetModuleHandle(NULL);
  GetModuleFileName(hModule, exe_path, MAX_PATH);
#else
  CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
#endif
#endif
  // Make sure it's null-terminated:
  exe_path[sizeof(exe_path) - 1] = 0;

  if (strip_exe) {
    // The exe is the last component of the path, so remove one component.
    string ret = exe_path;
    std::vector<string> components = port::Split(exe_path, '/');
    components.pop_back();
    return port::Join(components, "/");
  }
  return exe_path;
}

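// Loads the in-memory CUBIN pointed to by |cubin| into the driver, or bumps
// the refcount if that binary is already loaded. Expects in_memory_modules_mu_
// to be held by the caller.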
bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];

  if (*module == nullptr) {
    auto load_status = GpuDriver::LoadCubin(context_, cubin, module);
    if (!load_status.ok()) {
      LOG(ERROR) << "failed to load CUBIN: " << load_status;
      return false;
    }
    module_refcount = 1;
    VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[cubin] = {*module, module_refcount};
  return true;
}

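// As above, but for an in-memory PTX image rather than a CUBIN.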
bool GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];

  if (*module == nullptr) {
    if (!GpuDriver::LoadPtx(context_, ptx, module)) {
      return false;
    }
    VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
            << *module;
    module_refcount = 1;
  } else {
    ++module_refcount;
    VLOG(3) << "PTX " << static_cast<const void *>(ptx)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[ptx] = {*module, module_refcount};
  return true;
}

bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, CUmodule* module) {
  LOG(ERROR) << "Feature not supported on CUDA platform (LoadModuleFromHsaco)";
  return false;
}

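// Loads the GPU binary (CUBIN or PTX) described by |spec|, resolves the
// requested kernel function within it, and records the kernel-to-binary
// mapping so the binary can be unloaded when the kernel is unloaded.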
bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                            KernelBase* kernel) {
  GpuKernel* cuda_kernel = AsGpuKernel(kernel);
  CUmodule module;
  const string *kernelname;

  VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();

  if (spec.has_cuda_cubin_in_memory()) {
    mutex_lock lock{in_memory_modules_mu_};
    kernelname = &spec.cuda_cubin_in_memory().kernelname();
    const char *cubin = spec.cuda_cubin_in_memory().bytes();
    if (!LoadModuleFromCuBin(cubin, &module)) {
      return false;
    }
    kernel_to_gpu_binary_[kernel] = cubin;
  } else if (spec.has_cuda_ptx_in_memory()) {
    kernelname = &spec.cuda_ptx_in_memory().kernelname();

    if (cc_major_ == 0 && cc_minor_ == 0) {
      return false;
    }

    const char *ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_);
    if (ptx == nullptr) {
      ptx = spec.cuda_ptx_in_memory().default_text();
    }
    if (ptx == nullptr) {
      LOG(FATAL) << "loader spec has no ptx for kernel " << *kernelname;
      return false;
    }

    mutex_lock lock{in_memory_modules_mu_};
    if (!LoadModuleFromPtx(ptx, &module)) {
      return false;
    }
    kernel_to_gpu_binary_[kernel] = ptx;
  } else {
    LOG(WARNING) << "no method of loading CUDA kernel provided";
    return false;
  }
  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    cuda_kernel->gpu_function_ptr())) {
    return false;
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the CUDA API.
  cuda_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  if (!GetKernelMetadata(cuda_kernel, &kernel_metadata)) {
    LOG(WARNING) << "unable to get metadata for kernel " << *kernelname;
  }
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return true;
}

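// Decrements the refcount on the module associated with |gpu_binary| and
// unloads it from the driver once the refcount drops to zero. Returns false
// if no module is currently loaded for |gpu_binary|.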
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
    return false;
  }
  auto &module = module_it->second.first;
  auto &refcount = module_it->second.second;
  VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading CUDA module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
  }
  return true;
}

void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  mutex_lock lock{in_memory_modules_mu_};
  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
            << " has never been loaded.";
    return;  // We've never seen this kernel.
  }
  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
          << " has loaded GPU code " << gpu_binary_it->second;
  UnloadGpuBinary(gpu_binary_it->second);
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}

bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
                             ModuleHandle* module_handle) {
  // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
  // ModuleHandle::id().
  CUmodule cu_module;
  if (spec.has_cuda_cubin_in_memory()) {
    mutex_lock lock{in_memory_modules_mu_};
    if (!LoadModuleFromCuBin(
            reinterpret_cast<const char *>(spec.cuda_cubin_in_memory().data()),
            &cu_module)) {
      return false;
    }
    *module_handle = ModuleHandle(const_cast<void *>(
        static_cast<const void *>(spec.cuda_cubin_in_memory().data())));
    return true;
  } else if (spec.has_cuda_ptx_in_memory()) {
    if (cc_major_ == 0 && cc_minor_ == 0) {
      return false;
    }

    if (!spec.cuda_ptx_in_memory()) {
      return false;
    }

    mutex_lock lock{in_memory_modules_mu_};
    if (!LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module)) {
      return false;
    }
    *module_handle = ModuleHandle(const_cast<void *>(
        static_cast<const void *>(spec.cuda_ptx_in_memory())));
    return true;
  }
  LOG(WARNING) << "no method of loading CUDA module provided";
  return false;
}

bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
  const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
  mutex_lock lock{in_memory_modules_mu_};
  return UnloadGpuBinary(gpu_binary);
}

bool GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
                                    KernelMetadata* kernel_metadata) {
  int value;
  if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
                                   *cuda_kernel->gpu_function_ptr(), &value)) {
    return false;
  }
  kernel_metadata->set_registers_per_thread(value);

  if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
                                   *cuda_kernel->gpu_function_ptr(), &value)) {
    return false;
  }
  kernel_metadata->set_shared_memory_bytes(value);

  return true;
}

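// Launches |kernel| on |stream| with the given block/thread dimensions,
// applying the kernel's preferred cache configuration and (at VLOG level 2)
// logging occupancy information the first time each function is launched.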
bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                         const BlockDim& block_dims, const KernelBase& kernel,
                         const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  CUstream custream = AsGpuStreamValue(stream);
  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once. Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    mutex_lock lock(launched_kernels_mu_);
    if (!launched_kernels_.count(cufunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(cufunc);
    }
  }

  if (cuda_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    GpuDriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetGpuCacheConfig());
  }

  void **kernel_params = const_cast<void **>(args.argument_addresses().data());

  if (!GpuDriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
                               block_dims.z, thread_dims.x, thread_dims.y,
                               thread_dims.z, args.number_of_shared_bytes(),
                               custream, kernel_params,
                               nullptr /* = extra */)) {
    LOG(ERROR) << "failed to launch CUDA kernel " << kernel.name() << " with "
               << args.number_of_arguments()
               << " args; thread dim: " << thread_dims.ToString()
               << "; block dim: " << block_dims.ToString();
    return false;
  }

  return true;
}

// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  VLOG(2) << "Computing kernel occupancy for kernel "
          << kernel.demangled_name();
  VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
          << ", " << thread_dims.z << ")";

  int regs_per_thread;
  if (!kernel.metadata().registers_per_thread(&regs_per_thread)) {
    return;
  }

  int smem_per_block;
  if (!kernel.metadata().shared_memory_bytes(&smem_per_block)) {
    return;
  }

  const DeviceDescription &device_description =
      kernel.parent()->GetDeviceDescription();

  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
                                         smem_per_block, thread_dims, cufunc);
  VLOG(2) << "Resident blocks per SM is " << blocks_per_sm;

  int suggested_threads =
      CompareOccupancy(&blocks_per_sm, device_description, regs_per_thread,
                       smem_per_block, thread_dims, cufunc);
  if (suggested_threads != 0) {
    VLOG(2) << "The cuda occupancy calculator recommends using "
            << suggested_threads
            << " threads per block to achieve an occupancy of " << blocks_per_sm
            << " blocks per SM.";
  }
}

// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads
// per block. If unable to compute occupancy, zero is returned.
int GpuExecutor::CalculateOccupancy(
    const DeviceDescription& device_description, uint64 registers_per_thread,
    uint64 shared_memory_per_block, const ThreadDim& thread_dims,
    CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  return suggested_blocks;
}

// Compute and return the suggested thread count to achieve ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64 registers_per_thread,
                                  uint64 shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  if (suggested_blocks > *initial_blocks) {
    *initial_blocks = suggested_blocks;
    return suggested_threads;
  } else {
    return 0;
  }
}

void* GpuExecutor::Allocate(uint64 size) {
  return GpuDriver::DeviceAllocate(context_, size);
}

void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                                     uint64 size_bytes) {
  // offset and size are in bytes, so char* works as the pointer type.
  return reinterpret_cast<char *>(mem->opaque()) + offset_bytes;
}

void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  // CUDA "sub-buffers" are just pointer + offset, so no dealloc is necessary.
  if (!mem->is_sub_buffer()) {
    GpuDriver::DeviceDeallocate(context_, mem->opaque());
  }
}

bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}

bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}

bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}

bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), 0x0, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           0x0, size);
}

bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
                                    uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    // cudaMemset reinterprets "value" as a uint8.
    uint8 byte_value = static_cast<uint8>(value);
    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                     (byte_value << 8) | byte_value;
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), pattern, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           value, size);
}

port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                            const void* host_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                         host_src, size);
}

port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
                                            const DeviceMemoryBase& gpu_src,
                                            uint64 size) {
  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                         AsCudaDevicePtr(gpu_src), size);
}

bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
                          uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return Memset32(stream, location, 0x0, size);
  } else {
    return Memset(stream, location, 0x0, size);
  }
}

bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
                         uint8 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                            pattern, size,
                                            AsGpuStreamValue(stream));
}

bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                           uint32 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsCudaDevicePtr(location), pattern, size / 4,
      AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
                         const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
                         const void* host_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                          host_src, size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
                                       DeviceMemoryBase* gpu_dst,
                                       const DeviceMemoryBase& gpu_src,
                                       uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::HostCallback(Stream* stream,
                               std::function<port::Status()> callback) {
  auto callback_ptr = new std::function<void()>([callback]() {
    port::Status s = callback();
    if (!s.ok()) {
      LOG(WARNING) << "Host callback failed: " << s;
    }
  });
  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
                                      InternalHostCallback, callback_ptr);
}

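// Trampoline handed to the driver as a stream callback: invokes the
// heap-allocated std::function installed by HostCallback() and then deletes
// it.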
/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
                                                    CUresult status,
                                                    void* data) {
  std::function<void()> *callback =
      reinterpret_cast<std::function<void()> *>(data);
  (*callback)();
  delete callback;
}

port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}

port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}

port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}

port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
                                   AsGpuEvent(event)->gpu_event())) {
    return port::Status::OK();
  } else {
    return port::Status(
        port::error::INTERNAL,
        port::Printf("error recording waiting for CUDA event on stream %p",
                     stream));
  }
}

Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}

bool GpuExecutor::AllocateStream(Stream* stream) {
  return AsGpuStream(stream)->Init();
}

void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* cuda_stream = AsGpuStream(stream);
  if (!cuda_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  cuda_stream->Destroy();
}

bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}

void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}

bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  CUevent other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}

bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}

bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}

port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* GpuExecutor::CreateBlas() {
  PluginRegistry *registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::BlasFactory> status =
      registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.blas());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve BLAS factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

dnn::DnnSupport* GpuExecutor::CreateDnn() {
  PluginRegistry *registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::DnnFactory> status =
      registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.dnn());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve DNN factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

fft::FftSupport* GpuExecutor::CreateFft() {
  PluginRegistry *registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::FftFactory> status =
      registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.fft());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve FFT factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

rng::RngSupport* GpuExecutor::CreateRng() {
  PluginRegistry *registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::RngFactory> status =
      registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.rng());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve RNG factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }

bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
}

port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
}

SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
  port::StatusOr<CUsharedconfig> cuda_config =
      GpuDriver::ContextGetSharedMemConfig(context_);
  if (!cuda_config.ok()) {
    // Don't log; the failed call will log necessary output.
    return SharedMemoryConfig::kDefault;
  }

  switch (cuda_config.ValueOrDie()) {
    case CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE:
      return SharedMemoryConfig::kDefault;
    case CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE:
      return SharedMemoryConfig::kFourByte;
    case CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE:
      return SharedMemoryConfig::kEightByte;
    default:
      LOG(FATAL) << "Invalid shared memory configuration returned: "
                 << cuda_config.ValueOrDie();
  }
}

port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
    SharedMemoryConfig config) {
  CUsharedconfig cuda_config;
  switch (config) {
    case SharedMemoryConfig::kDefault:
      cuda_config = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
      break;
    case SharedMemoryConfig::kFourByte:
      cuda_config = CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE;
      break;
    case SharedMemoryConfig::kEightByte:
      cuda_config = CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE;
      break;
    default:
      LOG(FATAL) << "Invalid shared memory configuration specified: "
                 << static_cast<int>(config);
  }
  return GpuDriver::ContextSetSharedMemConfig(context_, cuda_config);
}

bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}

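// Looks up |symbol_name| in the module identified by |module_handle|, or in
// every loaded module if no handle is provided, returning the symbol's device
// address and size on success.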
bool GpuExecutor::GetSymbol(const string& symbol_name,
                            ModuleHandle module_handle, void** mem,
                            size_t* bytes) {
  auto lookup_in_module = [&](CUmodule module) {
    CHECK(module != nullptr);
    return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
                                      reinterpret_cast<CUdeviceptr*>(mem),
                                      bytes);
  };

  {  // give limited scope to mutex_lock
    mutex_lock lock{in_memory_modules_mu_};
    if (static_cast<bool>(module_handle)) {
      auto it = gpu_binary_to_module_.find(module_handle.id());
      CHECK(it != gpu_binary_to_module_.end());
      return lookup_in_module(it->second.first);
    }

    for (auto &it : gpu_binary_to_module_) {
      if (lookup_in_module(it.second.first)) {
        return true;
      }
    }
  }

  LOG(INFO) << "Failed to find symbol in any modules: " << symbol_name;
  return false;
}

bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
    return false;
  }

  block_dim_limit->x = x;
  block_dim_limit->y = y;
  block_dim_limit->z = z;
  return true;
}

bool GpuExecutor::SupportsBlas() const { return true; }

bool GpuExecutor::SupportsFft() const { return true; }

bool GpuExecutor::SupportsRng() const { return true; }

std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}

std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}

std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}

std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}

void* GpuExecutor::GpuContextHack() { return context_; }

GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
#if defined(__APPLE__)
  LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
  return 0;
#elif defined(PLATFORM_WINDOWS)
  // Windows support for NUMA is not currently implemented. Return node 0.
  return 0;
#elif defined(__aarch64__)
  LOG(INFO) << "ARM64 does not support NUMA - returning NUMA node zero";
  return 0;
#else
  VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
  static const int kUnknownNumaNode = -1;

  if (pci_bus_id.empty()) {
    LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
    return kUnknownNumaNode;
  }

  string filename =
      port::Printf("/sys/bus/pci/devices/%s/numa_node", pci_bus_id.c_str());

  // We have to use fopen/fread here so that the device properties can be
  // populated before the InitGoogle procedure has been completed (at which
  // point we could use the file::* utilities).
  FILE *file = fopen(filename.c_str(), "r");
  if (file == nullptr) {
    LOG(ERROR) << "could not open file to read NUMA node: " << filename
               << "\nYour kernel may have been built without NUMA support.";
    return kUnknownNumaNode;
  }

  string content;
  char buf[32];
  size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
  buf[did_read] = '\0';
  content = buf;

  int32 value;
  if (port::safe_strto32(content, &value)) {
    if (value < 0) {  // See http://b/18228951 for details on this path.
      LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
                << value << "), but there must be at least one NUMA node"
                   ", so returning NUMA node zero";
      fclose(file);
      return 0;
    }
    fclose(file);
    return value;
  }

  LOG(WARNING)
      << "could not convert SysFS file contents to integral NUMA node value: "
      << content;

  fclose(file);
  return kUnknownNumaNode;
#endif
}

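// Builds a DeviceDescription for this executor's device by querying the
// driver for attributes such as compute capability, memory size, clock rates,
// and occupancy limits.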
DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
  internal::DeviceDescriptionBuilder builder;

  {
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    string augmented_driver_version = port::Printf(
        "%d (%s)", driver_version,
        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
            .c_str());
    builder.set_driver_version(augmented_driver_version);
  }

  {
    string pci_bus_id = GpuDriver::GetPCIBusID(device_);

    // Lower the hex characters to match sysfs.
    pci_bus_id = port::Lowercase(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_);
    builder.set_numa_node(numa_node);
  }

  {
    builder.set_threads_per_block_limit(
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                      device_)
            .ValueOrDie());

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device_)
                             .ValueOrDie();
    thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device_)
                             .ValueOrDie();
    thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device_)
                             .ValueOrDie();
    builder.set_thread_dim_limit(thread_dim_limit);

    int clock_rate =
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
            .ValueOrDie();
    builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
  }

  {
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    uint64 device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
  port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
  if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
    // Times 2 because HBM is DDR memory; it gets two data bits per each data
    // lane.
    builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} *
                                 1000 *
                                 int64_t{mem_bus_width_bits.ValueOrDie()} / 8);
  }

  {
    BlockDim block_dim_limit;
    FillBlockDimLimit(&block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    string device_name;
    (void)GpuDriver::GetDeviceName(device_, &device_name);
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("Compute Capability ", cc_major_, ".", cc_minor_));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("NVIDIA Corporation");
  builder.set_cuda_compute_capability(cc_major_, cc_minor_);
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
  builder.set_registers_per_core_limit(
      GpuDriver::GetDeviceAttribute(
          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
          .ValueOrDie());

  // We are loading a dummy ptx kernel to set the device description's
  // blocks_per_core_limit by calling the CUDA occupancy calculator. This
  // value is currently required by XLA GPU's CalculateLaunchDimensions().
  const char* blank_ptx = R"(
.version 6.0
.target sm_30
.address_size 64

        // .globl testkernel
.visible .entry testkernel()
{
        ret;
})";
  const char* kernel_name = "testkernel";

  CUmodule blank_module;
  CUfunction blank_function;
  int bpc = -1;
  bool ptx_success = GpuDriver::LoadPtx(context_, blank_ptx, &blank_module);
  if (ptx_success) {
    ptx_success = GpuDriver::GetModuleFunction(context_, blank_module,
                                               kernel_name, &blank_function);
    if (ptx_success) {
      CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
          &bpc, blank_function, 1, 1);
      if (result != CUDA_SUCCESS) {
        bpc = -1;
        ptx_success = false;
      }
    }
    GpuDriver::UnloadModule(context_, blank_module);
  }
  if (!ptx_success) {
    LOG(ERROR) << "Failed to calculate max blocks per SM using dummy kernel.";
  }
  builder.set_blocks_per_core_limit(bpc);

  auto built = builder.Build();
  return built.release();
}

}  // namespace gpu

void initialize_cuda_gpu_executor() {
  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig& config) {
    return new gpu::GpuExecutor{config};
  };
}

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {
  stream_executor::initialize_cuda_gpu_executor();
});