/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"

#if defined(__APPLE__)
#include <mach-o/dyld.h>
#endif
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <unistd.h>
#endif
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_event.h"
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/cuda/cuda_timer.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/mathutil.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/path.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/timer.h"

// LOG(ERROR) uses a const named ERROR, so a macro with the same name is
// always unwanted. This happens on Windows, which defines such a macro.
#undef ERROR

#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif

#ifdef __CUDA_RUNTIME_H__
#error \
    "CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif

extern bool FLAGS_check_gpu_leaks;
bool FLAGS_prefer_cubin_to_ptx = true;

namespace stream_executor {
namespace gpu {

// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
// It has been observed that loading both PTX and cubins into the driver
// library can cause it to crash, but loading only CUBINs avoids those crashes;
// therefore, it's useful to have this hook to hack in uniform CUBIN-ation of
// PTX code.
//
// As this is an implementation-detail workaround, the usage is to declare this
// variable with extern linkage and populate it from another translation unit.
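//
// A minimal sketch of what such a translation unit might look like (the
// CompilePtxToCubin helper here is hypothetical, standing in for whatever
// PTX->CUBIN compiler the client has available):
//
//   namespace stream_executor {
//   namespace gpu {
//   extern std::function<std::string(const std::string&)> g_cubinate;
//   }  // namespace gpu
//   }  // namespace stream_executor
//
//   void InstallCubinateHook() {
//     stream_executor::gpu::g_cubinate = [](const std::string& ptx) {
//       return CompilePtxToCubin(ptx);  // hypothetical PTX->CUBIN compiler
//     };
//   }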
std::function<std::string(const std::string&)> g_cubinate;

static GpuEvent* AsGpuEvent(Event* event) {
  DCHECK(event != nullptr);
  return static_cast<GpuEvent*>(event->implementation());
}

// Given a platform-independent timer datatype, returns the internal CUDA
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
  DCHECK(timer != nullptr);
  return static_cast<GpuTimer*>(timer->implementation());
}

// Given const GPU memory, returns a libcuda device pointer datatype, suitable
// for passing directly to libcuda APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// libcuda APIs, so the caller should take care to only pass the result of
// const GPU memory conversions to libcuda functions which will honor
// constness.
static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase& gpu_mem) {
  return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque());
}

// See description on const version above.
static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase* gpu_mem) {
  return AsCudaDevicePtr(*gpu_mem);
}

GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
  CHECK(cuda_exec != nullptr);
  return cuda_exec->gpu_context();
}

GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
  return static_cast<GpuExecutor*>(stream_exec->implementation());
}

GpuExecutor::~GpuExecutor() {
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
}

port::Status GpuExecutor::Init(int device_ordinal,
                               DeviceOptions device_options) {
  device_ordinal_ = device_ordinal;

  auto status = GpuDriver::Init();
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::GetDevice(device_ordinal_, &device_);
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
                                    &context_);
  if (!status.ok()) {
    return status;
  }

  return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
}

bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    std::string* found_filename) const {
  if (cc_major_ == 0 && cc_minor_ == 0) {
    return false;
  }

  std::string cc_specific =
      absl::StrCat(filename, ".cc", cc_major_, cc_minor_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found compute-capability-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find compute-capability specific file at: "
          << cc_specific;
  if (port::FileExists(std::string(filename)).ok()) {
    *found_filename = std::string(filename);
    return true;
  }

  return false;
}

bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          std::string* found_filename) const {
  LOG(ERROR)
      << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
  return false;
}

// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
//                 returned string. Example: calling this from /usr/bin/foo
//                 would return /usr/bin.
static std::string GetBinaryDir(bool strip_exe) {
  char exe_path[PATH_MAX] = {0};
#if defined(__APPLE__)
  uint32_t buffer_size = 0U;
  _NSGetExecutablePath(nullptr, &buffer_size);
  char unresolved_path[buffer_size];
  _NSGetExecutablePath(unresolved_path, &buffer_size);
  CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
#else
#if defined(PLATFORM_WINDOWS)
  HMODULE hModule = GetModuleHandle(NULL);
  GetModuleFileName(hModule, exe_path, MAX_PATH);
#else
  PCHECK(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1) != -1);
#endif
#endif
  // Make sure it's null-terminated:
  exe_path[sizeof(exe_path) - 1] = 0;

  if (strip_exe) {
    // The exe is the last component of the path, so remove one component.
    std::string ret = exe_path;
    std::vector<std::string> components = absl::StrSplit(exe_path, '/');
    components.pop_back();
    return absl::StrJoin(components, "/");
  }
  return exe_path;
}

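// Loads the given CUBIN into the current context, or bumps the reference count
// if the same binary is already resident. The module handle and its refcount
// are tracked in gpu_binary_to_module_; callers are expected to hold
// in_memory_modules_mu_.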
port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
                                              CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadCubin(context_, cubin, module));
    module_refcount = 1;
    VLOG(3) << "Loaded CUBIN " << static_cast<const void*>(cubin)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "CUBIN " << static_cast<const void*>(cubin)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[cubin] = {*module, module_refcount};
  return port::Status::OK();
}

port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadPtx(context_, ptx, module));
    VLOG(3) << "Loaded PTX " << static_cast<const void*>(ptx) << " as module "
            << *module;
    module_refcount = 1;
  } else {
    ++module_refcount;
    VLOG(3) << "PTX " << static_cast<const void*>(ptx)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[ptx] = {*module, module_refcount};
  return port::Status::OK();
}

port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
                                              CUmodule* module) {
  return port::InternalError(
      "Feature not supported on CUDA platform (LoadModuleFromHsaco)");
}

port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
  GpuKernel* cuda_kernel = AsGpuKernel(kernel);
  CUmodule module;
  const std::string* kernelname;

  VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();

  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    kernelname = &spec.cuda_cubin_in_memory().kernelname();
    const char* cubin = spec.cuda_cubin_in_memory().bytes();
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(cubin, &module));
    kernel_to_gpu_binary_[kernel] = cubin;
  } else if (spec.has_cuda_ptx_in_memory()) {
    kernelname = &spec.cuda_ptx_in_memory().kernelname();

    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    const char* ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_);
    if (ptx == nullptr) {
      ptx = spec.cuda_ptx_in_memory().default_text();
    }
    if (ptx == nullptr) {
      LOG(FATAL) << "Loader spec has no ptx for kernel " << *kernelname;
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromPtx(ptx, &module));
    kernel_to_gpu_binary_[kernel] = ptx;
  } else {
    return port::InternalError("No method of loading CUDA kernel provided");
  }
  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    cuda_kernel->gpu_function_ptr())) {
    return port::InternalError("Could not find the corresponding function");
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the CUDA API.
  cuda_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  TF_RETURN_IF_ERROR(GetKernelMetadata(cuda_kernel, &kernel_metadata));
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return port::Status::OK();
}

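// Decrements the refcount of the module loaded for gpu_binary and unloads it
// from the driver once the refcount reaches zero. Returns false if no module
// is currently loaded for gpu_binary. Callers are expected to hold
// in_memory_modules_mu_.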
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
    return false;
  }
  auto& module = module_it->second.first;
  auto& refcount = module_it->second.second;
  VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading CUDA module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
  }
  return true;
}

void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  absl::MutexLock lock{&in_memory_modules_mu_};
  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
            << " has never been loaded.";
    return;  // We've never seen this kernel.
  }
  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
          << " has loaded GPU code " << gpu_binary_it->second;
  UnloadGpuBinary(gpu_binary_it->second);
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}

port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
                                     ModuleHandle* module_handle) {
  // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
  // ModuleHandle::id().
  CUmodule cu_module;
  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(
        reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
        &cu_module));
    *module_handle = ModuleHandle(const_cast<void*>(
        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
    return port::Status::OK();
  } else if (spec.has_cuda_ptx_in_memory()) {
    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    if (!spec.cuda_ptx_in_memory()) {
      return port::InternalError("PTX not found in spec");
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(
        LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module));
    *module_handle = ModuleHandle(const_cast<void*>(
        static_cast<const void*>(spec.cuda_ptx_in_memory())));
    return port::Status::OK();
  }
  return port::InternalError("No method of loading CUDA module provided");
}

bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
  absl::MutexLock lock{&in_memory_modules_mu_};
  return UnloadGpuBinary(gpu_binary);
}

port::Status GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
                                            KernelMetadata* kernel_metadata) {
  int value;
  TF_RETURN_IF_ERROR(GpuDriver::FuncGetAttribute(
      CU_FUNC_ATTRIBUTE_NUM_REGS, *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_registers_per_thread(value);

  TF_RETURN_IF_ERROR(
      GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
                                  *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_shared_memory_bytes(value);
  return port::Status::OK();
}

port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  CUstream custream = AsGpuStreamValue(stream);
  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once. Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    absl::MutexLock lock(&launched_kernels_mu_);
    if (!launched_kernels_.count(cufunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(cufunc);
    }
  }

  if (cuda_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
        cufunc, cuda_kernel->GetGpuCacheConfig()));
  }

  void** kernel_params = const_cast<void**>(args.argument_addresses().data());

  return GpuDriver::LaunchKernel(
      context_, cufunc, block_dims.x, block_dims.y, block_dims.z, thread_dims.x,
      thread_dims.y, thread_dims.z, args.number_of_shared_bytes(), custream,
      kernel_params, nullptr /* = extra */);
}

// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  VLOG(2) << "Computing kernel occupancy for kernel "
          << kernel.demangled_name();
  VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
          << ", " << thread_dims.z << ")";

  int regs_per_thread;
  if (!kernel.metadata().registers_per_thread(&regs_per_thread)) {
    return;
  }

  int smem_per_block;
  if (!kernel.metadata().shared_memory_bytes(&smem_per_block)) {
    return;
  }

  const DeviceDescription& device_description =
      kernel.parent()->GetDeviceDescription();

  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
                                         smem_per_block, thread_dims, cufunc);
  VLOG(2) << "Resident blocks per SM is " << blocks_per_sm;

  int suggested_threads =
      CompareOccupancy(&blocks_per_sm, device_description, regs_per_thread,
                       smem_per_block, thread_dims, cufunc);
  if (suggested_threads != 0) {
    VLOG(2) << "The cuda occupancy calculator recommends using "
            << suggested_threads
            << " threads per block to achieve an occupancy of " << blocks_per_sm
            << " blocks per SM.";
  }
}

// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads
// per block. If unable to compute occupancy, zero is returned.
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                                    uint64 registers_per_thread,
                                    uint64 shared_memory_per_block,
                                    const ThreadDim& thread_dims,
                                    CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  return suggested_blocks;
}

// Compute and return the suggested thread count to achieve ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64 registers_per_thread,
                                  uint64 shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  if (suggested_blocks > *initial_blocks) {
    *initial_blocks = suggested_blocks;
    return suggested_threads;
  } else {
    return 0;
  }
}

DeviceMemoryBase GpuExecutor::Allocate(uint64 size, int64 memory_space) {
  CHECK_EQ(memory_space, 0);
  return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
}

void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                                uint64 size_bytes) {
  // offset and size are in bytes, so char* works as the pointer type.
  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
}

void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  GpuDriver::DeviceDeallocate(context_, mem->opaque());
}

bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}

bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}

bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}

port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
                                             uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), 0x0, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           0x0, size);
}

port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
                                            int value, uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    // cudaMemset reinterprets "value" as a uint8.
    uint8 byte_value = static_cast<uint8>(value);
    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                     (byte_value << 8) | byte_value;
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), pattern, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           value, size);
}

port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                            const void* host_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                         host_src, size);
}

port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
                                            const DeviceMemoryBase& gpu_src,
                                            uint64 size) {
  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
                                  uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return Memset32(stream, location, 0x0, size);
  } else {
    return Memset(stream, location, 0x0, size);
  }
}

port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
                                 uint8 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                            pattern, size,
                                            AsGpuStreamValue(stream));
}

port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                                   uint32 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsCudaDevicePtr(location), pattern, size / 4,
      AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
                         const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
                         const void* host_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                          host_src, size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
                                       DeviceMemoryBase* gpu_dst,
                                       const DeviceMemoryBase& gpu_src,
                                       uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

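// Enqueues the given host callback on the stream. The status-returning
// callback is wrapped in a heap-allocated void() closure that logs failures;
// InternalHostCallback invokes that closure once and then deletes it.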
bool GpuExecutor::HostCallback(Stream* stream,
                               std::function<port::Status()> callback) {
  auto callback_ptr = new std::function<void()>([callback]() {
    port::Status s = callback();
    if (!s.ok()) {
      LOG(WARNING) << "Host callback failed: " << s;
    }
  });
  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
                                      InternalHostCallback, callback_ptr);
}

/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
                                                    CUresult status,
                                                    void* data) {
  std::function<void()>* callback =
      reinterpret_cast<std::function<void()>*>(data);
  (*callback)();
  delete callback;
}

port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}

port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}

port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}

port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
                                   AsGpuEvent(event)->gpu_event())) {
    return port::Status::OK();
  } else {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("error recording waiting for CUDA event on stream %p",
                        stream));
  }
}

Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}

bool GpuExecutor::AllocateStream(Stream* stream) {
  return AsGpuStream(stream)->Init();
}

void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* cuda_stream = AsGpuStream(stream);
  if (!cuda_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  cuda_stream->Destroy();
}

bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}

void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}

bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  CUevent other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}

bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}

bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}

port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* GpuExecutor::CreateBlas() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::BlasFactory> status =
      registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.blas());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve BLAS factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

dnn::DnnSupport* GpuExecutor::CreateDnn() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::DnnFactory> status =
      registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.dnn());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve DNN factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

fft::FftSupport* GpuExecutor::CreateFft() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::FftFactory> status =
      registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.fft());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve FFT factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

rng::RngSupport* GpuExecutor::CreateRng() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::RngFactory> status =
      registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.rng());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve RNG factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }

bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
}

port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
}

bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}

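// Looks up symbol_name in the module identified by module_handle if one is
// provided; otherwise every loaded module is searched in turn. On success the
// symbol's device address and size are written to *mem and *bytes.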
bool GpuExecutor::GetSymbol(const std::string& symbol_name,
                            ModuleHandle module_handle, void** mem,
                            size_t* bytes) {
  auto lookup_in_module = [&](CUmodule module) {
    CHECK(module != nullptr);
    return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
                                      reinterpret_cast<CUdeviceptr*>(mem),
                                      bytes);
  };

  {  // give limited scope to mutex_lock
    absl::MutexLock lock{&in_memory_modules_mu_};
    if (static_cast<bool>(module_handle)) {
      auto it = gpu_binary_to_module_.find(module_handle.id());
      CHECK(it != gpu_binary_to_module_.end());
      return lookup_in_module(it->second.first);
    }

    for (auto& it : gpu_binary_to_module_) {
      if (lookup_in_module(it.second.first)) {
        return true;
      }
    }
  }

  LOG(INFO) << "Failed to find symbol in any modules: " << symbol_name;
  return false;
}

bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
    return false;
  }

  block_dim_limit->x = x;
  block_dim_limit->y = y;
  block_dim_limit->z = z;
  return true;
}

bool GpuExecutor::SupportsBlas() const { return true; }

bool GpuExecutor::SupportsFft() const { return true; }

bool GpuExecutor::SupportsRng() const { return true; }

std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}

std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}

std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}

std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}

void* GpuExecutor::GpuContextHack() { return context_; }

GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
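//
// The node is read from /sys/bus/pci/devices/<pci_bus_id>/numa_node, which
// holds a single integer (negative when the kernel has no NUMA affinity
// recorded for the device).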
static int TryToReadNumaNode(const std::string& pci_bus_id,
                             int device_ordinal) {
#if defined(__APPLE__)
  LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
  return 0;
#elif defined(PLATFORM_WINDOWS)
  // Windows support for NUMA is not currently implemented. Return node 0.
  return 0;
#elif defined(__aarch64__)
  LOG(INFO) << "ARM64 does not support NUMA - returning NUMA node zero";
  return 0;
#else
  VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
  static const int kUnknownNumaNode = -1;

  if (pci_bus_id.empty()) {
    LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
    return kUnknownNumaNode;
  }

  std::string filename =
      absl::StrFormat("/sys/bus/pci/devices/%s/numa_node", pci_bus_id);

  // We have to use fopen/fread here so that the device properties can be
  // populated before the InitGoogle procedure has completed (at which point
  // we could use the file::* utilities).
  FILE* file = fopen(filename.c_str(), "r");
  if (file == nullptr) {
    LOG(INFO) << "could not open file to read NUMA node: " << filename
              << "\nYour kernel may have been built without NUMA support.";
    return kUnknownNumaNode;
  }

  std::string content;
  char buf[32];
  size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
  buf[did_read] = '\0';
  content = buf;

  int32 value;
  if (port::safe_strto32(content, &value)) {
    if (value < 0) {  // See http://b/18228951 for details on this path.
      LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
                << value
                << "), but there must be at least one NUMA node"
                   ", so returning NUMA node zero";
      fclose(file);
      return 0;
    }
    fclose(file);
    return value;
  }

  LOG(WARNING)
      << "could not convert SysFS file contents to integral NUMA node value: "
      << content;

  fclose(file);
  return kUnknownNumaNode;
#endif
}

port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  int cc_major;
  int cc_minor;
  status = GpuDriver::GetComputeCapability(&cc_major, &cc_minor, device);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    std::string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion()));
    builder.set_driver_version(augmented_driver_version);
  }

  {
    std::string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lower the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  {
    builder.set_threads_per_block_limit(
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                      device)
            .ValueOrDie());

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device)
                             .ValueOrDie();
    thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device)
                             .ValueOrDie();
    thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device)
                             .ValueOrDie();
    builder.set_thread_dim_limit(thread_dim_limit);

    int clock_rate =
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)
            .ValueOrDie();
    builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
  }

  {
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    uint64 device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal);
  port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal);
  if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
    // Times 2 because HBM is DDR memory; it gets two data bits per each data
    // lane.
    builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} *
                                 1000 *
                                 int64_t{mem_bus_width_bits.ValueOrDie()} / 8);
  }

  {
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    std::string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("Compute Capability ", cc_major, ".", cc_minor));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("NVIDIA Corporation");
  builder.set_cuda_compute_capability(cc_major, cc_minor);
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  builder.set_registers_per_core_limit(
      GpuDriver::GetDeviceAttribute(
          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device)
          .ValueOrDie());

  return builder.Build();
}

}  // namespace gpu

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {});