/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"

#if defined(__APPLE__)
#include <mach-o/dyld.h>
#endif
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <unistd.h>
#endif
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_event.h"
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/cuda/cuda_timer.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/mathutil.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/path.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/timer.h"

// LOG(ERROR) uses a const named ERROR, so a macro with the same name is
// always unwanted. This happens on Windows, which defines such a macro.
#undef ERROR

#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif

#ifdef __CUDA_RUNTIME_H__
#error \
    "CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif

extern bool FLAGS_check_gpu_leaks;
bool FLAGS_prefer_cubin_to_ptx = true;

namespace stream_executor {
namespace gpu {

// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
// It has been observed that loading both PTX and cubins into the driver library
// can cause it to crash, but loading only CUBINs avoids those crashes;
// therefore, it's useful to have this hook to hack in uniform CUBIN-ation of
// PTX code.
//
// As this is an implementation-detail workaround, the usage is to declare this
// variable with extern linkage and populate it from another translation unit.
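//
// A sketch (for illustration only) of how another translation unit might
// install such a hook; `CompilePtxToCubinSomehow` is a hypothetical helper,
// not something provided by this codebase:
//
//   extern std::function<std::string(const std::string&)> g_cubinate;
//
//   static bool RegisterCubinateHook() {
//     g_cubinate = [](const std::string& ptx) {
//       return CompilePtxToCubinSomehow(ptx);  // hypothetical PTX->CUBIN step
//     };
//     return true;
//   }
//   static bool cubinate_hook_registered = RegisterCubinateHook();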
std::function<std::string(const std::string&)> g_cubinate;

static GpuEvent* AsGpuEvent(Event* event) {
  DCHECK(event != nullptr);
  return static_cast<GpuEvent*>(event->implementation());
}

// Given a platform-independent timer datatype, returns the internal CUDA
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
  DCHECK(timer != nullptr);
  return static_cast<GpuTimer*>(timer->implementation());
}

// Given const GPU memory, returns a libcuda device pointer datatype, suitable
// for passing directly to libcuda APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// libcuda APIs, so the caller should take care to only pass the result of const
// GPU memory conversions to libcuda functions which will honor constness.
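//
// Typical usage (mirroring the memcpy wrappers later in this file), e.g.:
//
//   GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
//                                   host_src, size);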
static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase& gpu_mem) {
  return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque());
}

// See description on const version above.
static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase* gpu_mem) {
  return AsCudaDevicePtr(*gpu_mem);
}

GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
  CHECK(cuda_exec != nullptr);
  return cuda_exec->gpu_context();
}

GpuExecutor::~GpuExecutor() {
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
}

port::Status GpuExecutor::Init(int device_ordinal,
                               DeviceOptions device_options) {
  device_ordinal_ = device_ordinal;

  auto status = GpuDriver::Init();
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::GetDevice(device_ordinal_, &device_);
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
                                    &context_);
  if (!status.ok()) {
    return status;
  }

  return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
}

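// Looks on disk for a compute-capability-specific variant of `filename`,
// e.g. "foo.cc70.ptx" for filename "foo", canonical_suffix ".ptx", and a
// compute capability 7.0 device, falling back to the plain filename if no
// CC-specific file exists.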
bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    std::string* found_filename) const {
  if (cc_major_ == 0 && cc_minor_ == 0) {
    return false;
  }

  std::string cc_specific =
      absl::StrCat(filename, ".cc", cc_major_, cc_minor_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found compute-capability-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find compute-capability specific file at: "
          << cc_specific;
  if (port::FileExists(std::string(filename)).ok()) {
    *found_filename = std::string(filename);
    return true;
  }

  return false;
}

bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          std::string* found_filename) const {
  LOG(ERROR)
      << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
  return false;
}
// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
//                 returned string. Example: calling this from /usr/bin/foo
//                 would return /usr/bin.
static std::string GetBinaryDir(bool strip_exe) {
  char exe_path[PATH_MAX] = {0};
#if defined(__APPLE__)
  uint32_t buffer_size = 0U;
  _NSGetExecutablePath(nullptr, &buffer_size);
  char unresolved_path[buffer_size];
  _NSGetExecutablePath(unresolved_path, &buffer_size);
  CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
#else
#if defined(PLATFORM_WINDOWS)
  HMODULE hModule = GetModuleHandle(NULL);
  GetModuleFileName(hModule, exe_path, MAX_PATH);
#else
  PCHECK(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1) != -1);
#endif
#endif
  // Make sure it's null-terminated:
  exe_path[sizeof(exe_path) - 1] = 0;

  if (strip_exe) {
    // The exe is the last component of the path, so remove one component.
    std::string ret = exe_path;
    std::vector<std::string> components = absl::StrSplit(exe_path, '/');
    components.pop_back();
    return absl::StrJoin(components, "/");
  }
  return exe_path;
}

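// Loads the module for `cubin` into the driver, or reuses an already-loaded
// copy, and bumps its refcount in gpu_binary_to_module_. Callers (GetKernel,
// LoadModule) are expected to hold in_memory_modules_mu_.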
port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
                                              CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadCubin(context_, cubin, module));
    module_refcount = 1;
    VLOG(3) << "Loaded CUBIN " << static_cast<const void*>(cubin)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "CUBIN " << static_cast<const void*>(cubin)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[cubin] = {*module, module_refcount};
  return port::Status::OK();
}

port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadPtx(context_, ptx, module));
    VLOG(3) << "Loaded PTX " << static_cast<const void*>(ptx) << " as module "
            << *module;
    module_refcount = 1;
  } else {
    ++module_refcount;
    VLOG(3) << "PTX " << static_cast<const void*>(ptx)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[ptx] = {*module, module_refcount};
  return port::Status::OK();
}

port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
                                              CUmodule* module) {
  return port::InternalError(
      "Feature not supported on CUDA platform (LoadModuleFromHsaco)");
}

port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
  GpuKernel* cuda_kernel = AsGpuKernel(kernel);
  CUmodule module;
  const std::string* kernelname;

  VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();

  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    kernelname = &spec.cuda_cubin_in_memory().kernelname();
    const char* cubin = spec.cuda_cubin_in_memory().bytes();
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(cubin, &module));
    kernel_to_gpu_binary_[kernel] = cubin;
  } else if (spec.has_cuda_ptx_in_memory()) {
    kernelname = &spec.cuda_ptx_in_memory().kernelname();

    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    const char* ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_);
    if (ptx == nullptr) {
      ptx = spec.cuda_ptx_in_memory().default_text();
    }
    if (ptx == nullptr) {
      LOG(FATAL) << "Loader spec has no ptx for kernel " << *kernelname;
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromPtx(ptx, &module));
    kernel_to_gpu_binary_[kernel] = ptx;
  } else {
    return port::InternalError("No method of loading CUDA kernel provided");
  }
  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    cuda_kernel->gpu_function_ptr())) {
    return port::InternalError("Could not find the corresponding function");
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the CUDA API.
  cuda_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  TF_RETURN_IF_ERROR(GetKernelMetadata(cuda_kernel, &kernel_metadata));
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return port::Status::OK();
}

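// Drops one reference on the module previously loaded for `gpu_binary` (a PTX
// or CUBIN pointer passed to LoadModuleFromCuBin/LoadModuleFromPtx) and
// unloads it from the driver once the refcount reaches zero. Returns false if
// no module is associated with `gpu_binary`. Callers are expected to hold
// in_memory_modules_mu_.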
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
    return false;
  }
  auto& module = module_it->second.first;
  auto& refcount = module_it->second.second;
  VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading CUDA module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
  }
  return true;
}

void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  absl::MutexLock lock{&in_memory_modules_mu_};
  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
            << " has never been loaded.";
    return;  // We've never seen this kernel.
  }
  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
          << " has loaded GPU code " << gpu_binary_it->second;
  UnloadGpuBinary(gpu_binary_it->second);
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}

port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
                                     ModuleHandle* module_handle) {
  // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
  // ModuleHandle::id().
  CUmodule cu_module;
  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(
        reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
        &cu_module));
    *module_handle = ModuleHandle(const_cast<void*>(
        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
    return port::Status::OK();
  } else if (spec.has_cuda_ptx_in_memory()) {
    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    if (!spec.cuda_ptx_in_memory()) {
      return port::InternalError("PTX not found in spec");
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module));
    *module_handle = ModuleHandle(
        const_cast<void*>(static_cast<const void*>(spec.cuda_ptx_in_memory())));
    return port::Status::OK();
  }
  return port::InternalError("No method of loading CUDA module provided");
}

bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
  absl::MutexLock lock{&in_memory_modules_mu_};
  return UnloadGpuBinary(gpu_binary);
}

port::Status GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
                                            KernelMetadata* kernel_metadata) {
  int value;
  TF_RETURN_IF_ERROR(GpuDriver::FuncGetAttribute(
      CU_FUNC_ATTRIBUTE_NUM_REGS, *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_registers_per_thread(value);

  TF_RETURN_IF_ERROR(
      GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
                                  *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_shared_memory_bytes(value);
  return port::Status::OK();
}

port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  CUstream custream = AsGpuStreamValue(stream);
  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once. Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    absl::MutexLock lock(&launched_kernels_mu_);
    if (!launched_kernels_.count(cufunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(cufunc);
    }
  }

  if (cuda_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
        cufunc, cuda_kernel->GetGpuCacheConfig()));
  }

  void** kernel_params = const_cast<void**>(args.argument_addresses().data());

  return GpuDriver::LaunchKernel(context_, kernel.name(), cufunc, block_dims.x,
                                 block_dims.y, block_dims.z, thread_dims.x,
                                 thread_dims.y, thread_dims.z,
                                 args.number_of_shared_bytes(), custream,
                                 kernel_params, nullptr /* = extra */);
}

// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  VLOG(2) << "Computing kernel occupancy for kernel "
          << kernel.demangled_name();
  VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
          << ", " << thread_dims.z << ")";

  int regs_per_thread;
  if (!kernel.metadata().registers_per_thread(&regs_per_thread)) {
    return;
  }

  int smem_per_block;
  if (!kernel.metadata().shared_memory_bytes(&smem_per_block)) {
    return;
  }

  const DeviceDescription& device_description =
      kernel.parent()->GetDeviceDescription();

  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
                                         smem_per_block, thread_dims, cufunc);
  VLOG(2) << "Resident blocks per SM is " << blocks_per_sm;

  int suggested_threads =
      CompareOccupancy(&blocks_per_sm, device_description, regs_per_thread,
                       smem_per_block, thread_dims, cufunc);
  if (suggested_threads != 0) {
    VLOG(2) << "The cuda occupancy calculator recommends using "
            << suggested_threads
            << " threads per block to achieve an occupancy of " << blocks_per_sm
            << " blocks per SM.";
  }
}

// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads
// per block. If unable to compute occupancy, zero is returned.
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                                    uint64 registers_per_thread,
                                    uint64 shared_memory_per_block,
                                    const ThreadDim& thread_dims,
                                    CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  return suggested_blocks;
}

// Compute and return the suggested thread count to achieve ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64 registers_per_thread,
                                  uint64 shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  if (suggested_blocks > *initial_blocks) {
    *initial_blocks = suggested_blocks;
    return suggested_threads;
  } else {
    return 0;
  }
}

DeviceMemoryBase GpuExecutor::Allocate(uint64 size, int64_t memory_space) {
  CHECK_EQ(memory_space, 0);
  return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
}

void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                                uint64 size_bytes) {
  // offset and size are in bytes, so char* works as the pointer type.
  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
}

void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  GpuDriver::DeviceDeallocate(context_, mem->opaque());
}

bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}

bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}

bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}

port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
                                             uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), 0x0, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           0x0, size);
}

port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
                                            int value, uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    // cudaMemset reinterprets "value" as a uint8.
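    // Broadcasting that byte into a 32-bit word lets us use the 4-byte memset
    // for aligned, 4-byte-multiple regions; e.g. value 0xAB yields the pattern
    // 0xABABABAB, so a 16-byte region is written as four uint32 stores.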
    uint8 byte_value = static_cast<uint8>(value);
    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                     (byte_value << 8) | byte_value;
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), pattern, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           value, size);
}

port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                            const void* host_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                         host_src, size);
}

port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
                                            const DeviceMemoryBase& gpu_src,
                                            uint64 size) {
  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
                                  uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return Memset32(stream, location, 0x0, size);
  } else {
    return Memset(stream, location, 0x0, size);
  }
}

port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
                                 uint8 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                            pattern, size,
                                            AsGpuStreamValue(stream));
}

port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                                   uint32 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsCudaDevicePtr(location), pattern, size / 4,
      AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
                         const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
                         const void* host_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                          host_src, size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
                                       DeviceMemoryBase* gpu_dst,
                                       const DeviceMemoryBase& gpu_src,
                                       uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::HostCallback(Stream* stream,
                               std::function<port::Status()> callback) {
  auto callback_ptr = new std::function<void()>([callback]() {
    port::Status s = callback();
    if (!s.ok()) {
      LOG(WARNING) << "Host callback failed: " << s;
    }
  });
  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
                                      InternalHostCallback, callback_ptr);
}

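// Invoked by the CUDA driver when the stream reaches the callback enqueued by
// HostCallback above; takes ownership of, invokes, and deletes the
// heap-allocated std::function wrapper.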
/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
                                                    CUresult status,
                                                    void* data) {
  std::function<void()>* callback =
      reinterpret_cast<std::function<void()>*>(data);
  (*callback)();
  delete callback;
}

port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}

port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}

port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}

port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
                                   AsGpuEvent(event)->gpu_event())) {
    return port::Status::OK();
  } else {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("error recording waiting for CUDA event on stream %p",
                        stream));
  }
}

Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}

bool GpuExecutor::AllocateStream(Stream* stream) {
  return AsGpuStream(stream)->Init();
}

void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* cuda_stream = AsGpuStream(stream);
  if (!cuda_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  cuda_stream->Destroy();
}

bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}

void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}

bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  CUevent other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}

bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}

bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}

port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* GpuExecutor::CreateBlas() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::BlasFactory> status =
      registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.blas());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve BLAS factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

dnn::DnnSupport* GpuExecutor::CreateDnn() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::DnnFactory> status =
      registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.dnn());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve DNN factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

fft::FftSupport* GpuExecutor::CreateFft() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::FftFactory> status =
      registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.fft());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve FFT factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

rng::RngSupport* GpuExecutor::CreateRng() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::RngFactory> status =
      registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.rng());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve RNG factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }

bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
}

port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
}

bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}

bool GpuExecutor::GetSymbol(const std::string& symbol_name,
                            ModuleHandle module_handle, void** mem,
                            size_t* bytes) {
  auto lookup_in_module = [&](CUmodule module) {
    CHECK(module != nullptr);
    return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
                                      reinterpret_cast<CUdeviceptr*>(mem),
                                      bytes);
  };

  {  // give limited scope to mutex_lock
    absl::MutexLock lock{&in_memory_modules_mu_};
    if (static_cast<bool>(module_handle)) {
      auto it = gpu_binary_to_module_.find(module_handle.id());
      CHECK(it != gpu_binary_to_module_.end());
      return lookup_in_module(it->second.first);
    }

    for (auto& it : gpu_binary_to_module_) {
      if (lookup_in_module(it.second.first)) {
        return true;
      }
    }
  }

  LOG(INFO) << "Failed to find symbol in any modules: " << symbol_name;
  return false;
}

bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
    return false;
  }

  block_dim_limit->x = x;
  block_dim_limit->y = y;
  block_dim_limit->z = z;
  return true;
}

bool GpuExecutor::SupportsBlas() const { return true; }

bool GpuExecutor::SupportsFft() const { return true; }

bool GpuExecutor::SupportsRng() const { return true; }

std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}

std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}

std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}

std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}

void* GpuExecutor::GpuContextHack() { return context_; }

GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
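//
// For example, for a device whose (lowercased) PCI bus ID is "0000:04:00.0",
// this reads /sys/bus/pci/devices/0000:04:00.0/numa_node on Linux.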
static int TryToReadNumaNode(const std::string& pci_bus_id,
                             int device_ordinal) {
#if defined(__APPLE__)
  LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
  return 0;
#elif defined(PLATFORM_WINDOWS)
  // Windows support for NUMA is not currently implemented. Return node 0.
  return 0;
#elif defined(__aarch64__)
  LOG(INFO) << "ARM64 does not support NUMA - returning NUMA node zero";
  return 0;
#else
  VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
  static const int kUnknownNumaNode = -1;

  if (pci_bus_id.empty()) {
    LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
    return kUnknownNumaNode;
  }

  std::string filename =
      absl::StrFormat("/sys/bus/pci/devices/%s/numa_node", pci_bus_id);

  // We have to use fopen/fread here so that the device properties can be
  // populated before the InitGoogle procedure has been completed (at which
  // point we could use the file::* utilities).
  FILE* file = fopen(filename.c_str(), "r");
  if (file == nullptr) {
    LOG(INFO) << "could not open file to read NUMA node: " << filename
              << "\nYour kernel may have been built without NUMA support.";
    return kUnknownNumaNode;
  }

  std::string content;
  char buf[32];
  size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
  buf[did_read] = '\0';
  content = buf;

  int32_t value;
  if (port::safe_strto32(content, &value)) {
    if (value < 0) {  // See http://b/18228951 for details on this path.
      LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
                << value
                << "), but there must be at least one NUMA node"
                   ", so returning NUMA node zero";
      fclose(file);
      return 0;
    }
    fclose(file);
    return value;
  }

  LOG(WARNING)
      << "could not convert SysFS file contents to integral NUMA node value: "
      << content;

  fclose(file);
  return kUnknownNumaNode;
#endif
}

port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  int cc_major;
  int cc_minor;
  status = GpuDriver::GetComputeCapability(&cc_major, &cc_minor, device);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    std::string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion()));
    builder.set_driver_version(augmented_driver_version);
  }

  {
    std::string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lower the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  {
    builder.set_threads_per_block_limit(
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                      device)
            .ValueOrDie());

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device)
                             .ValueOrDie();
    thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device)
                             .ValueOrDie();
    thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device)
                             .ValueOrDie();
    builder.set_thread_dim_limit(thread_dim_limit);

    int clock_rate =
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)
            .ValueOrDie();
    builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
  }

  {
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    uint64 device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal);
  port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal);
  if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
    // Times 2 because HBM is DDR memory; it gets two data bits per each data
    // lane.
    builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} *
                                 1000 *
                                 int64_t{mem_bus_width_bits.ValueOrDie()} / 8);
  }

  {
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    std::string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("Compute Capability ", cc_major, ".", cc_minor));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("NVIDIA Corporation");
  builder.set_cuda_compute_capability(cc_major, cc_minor);
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  builder.set_registers_per_core_limit(
      GpuDriver::GetDeviceAttribute(
          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device)
          .ValueOrDie());

  return builder.Build();
}

}  // namespace gpu

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {});