/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"

#if defined(__APPLE__)
#include <mach-o/dyld.h>
#endif
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <unistd.h>
#endif
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_event.h"
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/cuda/cuda_timer.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/mathutil.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/path.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/timer.h"

// LOG(ERROR) uses a const named ERROR, so a macro with the same name is
// always unwanted. This happens on Windows, which defines such a macro.
#undef ERROR

#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif

#ifdef __CUDA_RUNTIME_H__
#error \
    "CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif

extern bool FLAGS_check_gpu_leaks;
bool FLAGS_prefer_cubin_to_ptx = true;

namespace stream_executor {
namespace gpu {

// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
// It has been observed that loading both PTX and cubins into the driver library
// can cause it to crash, but loading only CUBINs avoids those crashes;
// therefore, it's useful to have this hook to hack in uniform CUBIN-ation of
// PTX code.
//
// As this is an implementation-detail workaround, the usage is to declare this
// variable with extern linkage and populate it from another translation unit.
std::function<std::string(const std::string&)> g_cubinate;
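//
// A minimal sketch of populating the hook from another translation unit (the
// compile helper is hypothetical):
//
//   extern std::function<std::string(const std::string&)> g_cubinate;
//   g_cubinate = [](const std::string& ptx) {
//     return CompilePtxToCubin(ptx);  // hypothetical PTX -> CUBIN compiler
//   };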

static GpuEvent* AsGpuEvent(Event* event) {
  DCHECK(event != nullptr);
  return static_cast<GpuEvent*>(event->implementation());
}

// Given a platform-independent timer datatype, returns the internal CUDA
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
  DCHECK(timer != nullptr);
  return static_cast<GpuTimer*>(timer->implementation());
}

// Given const GPU memory, returns a libcuda device pointer datatype, suitable
// for passing directly to libcuda APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// libcuda APIs, so the caller should take care to only pass the result of const
// GPU memory conversions to libcuda functions which will honor constness.
static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase& gpu_mem) {
  return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque());
}

// See description on const version above.
static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase* gpu_mem) {
  return AsCudaDevicePtr(*gpu_mem);
}

GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
  CHECK(cuda_exec != nullptr);
  return cuda_exec->gpu_context();
}

GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
  return static_cast<GpuExecutor*>(stream_exec->implementation());
}

GpuExecutor::~GpuExecutor() {
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
}

port::Status GpuExecutor::Init(int device_ordinal,
                               DeviceOptions device_options) {
  device_ordinal_ = device_ordinal;

  auto status = GpuDriver::Init();
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::GetDevice(device_ordinal_, &device_);
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
                                    &context_);
  if (!status.ok()) {
    return status;
  }

  return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
}

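// Prefers a compute-capability-specific variant of `filename`; e.g. (names
// illustrative) filename "kernel" with canonical_suffix ".cubin" on a CC 7.0
// device resolves to "kernel.cc70.cubin" if that file exists, falling back to
// "kernel" itself otherwise.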
bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    std::string* found_filename) const {
  if (cc_major_ == 0 && cc_minor_ == 0) {
    return false;
  }

  std::string cc_specific =
      absl::StrCat(filename, ".cc", cc_major_, cc_minor_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found compute-capability-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find compute-capability specific file at: "
          << cc_specific;
  if (port::FileExists(std::string(filename)).ok()) {
    *found_filename = std::string(filename);
    return true;
  }

  return false;
}

bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          std::string* found_filename) const {
  LOG(ERROR)
      << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
  return false;
}
// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
//                 returned string. Example: calling this from /usr/bin/foo
//                 would return /usr/bin.
static std::string GetBinaryDir(bool strip_exe) {
  char exe_path[PATH_MAX] = {0};
#if defined(__APPLE__)
  uint32_t buffer_size = 0U;
  _NSGetExecutablePath(nullptr, &buffer_size);
  char unresolved_path[buffer_size];
  _NSGetExecutablePath(unresolved_path, &buffer_size);
  CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
#else
#if defined(PLATFORM_WINDOWS)
  HMODULE hModule = GetModuleHandle(NULL);
  GetModuleFileName(hModule, exe_path, MAX_PATH);
#else
  PCHECK(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1) != -1);
#endif
#endif
  // Make sure it's null-terminated:
  exe_path[sizeof(exe_path) - 1] = 0;

  if (strip_exe) {
    // The exe is the last component of the path, so remove one component.
    std::vector<std::string> components = absl::StrSplit(exe_path, '/');
    components.pop_back();
    return absl::StrJoin(components, "/");
  }
  return exe_path;
}

port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
                                              CUmodule* module) {
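  // NOTE: operator[] default-constructs a {nullptr, 0} entry on first lookup,
  // so a null *module below means this binary has not been loaded before.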
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadCubin(context_, cubin, module));
    module_refcount = 1;
    VLOG(3) << "Loaded CUBIN " << static_cast<const void*>(cubin)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "CUBIN " << static_cast<const void*>(cubin)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[cubin] = {*module, module_refcount};
  return port::Status::OK();
}

port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadPtx(context_, ptx, module));
    VLOG(3) << "Loaded PTX " << static_cast<const void*>(ptx) << " as module "
            << *module;
    module_refcount = 1;
  } else {
    ++module_refcount;
    VLOG(3) << "PTX " << static_cast<const void*>(ptx)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[ptx] = {*module, module_refcount};
  return port::Status::OK();
}

port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
                                              CUmodule* module) {
  return port::InternalError(
      "Feature not supported on CUDA platform (LoadModuleFromHsaco)");
}

port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
  GpuKernel* cuda_kernel = AsGpuKernel(kernel);
  CUmodule module;
  const std::string* kernelname;

  VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();

  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    kernelname = &spec.cuda_cubin_in_memory().kernelname();
    const char* cubin = spec.cuda_cubin_in_memory().bytes();
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(cubin, &module));
    kernel_to_gpu_binary_[kernel] = cubin;
  } else if (spec.has_cuda_ptx_in_memory()) {
    kernelname = &spec.cuda_ptx_in_memory().kernelname();

    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    const char* ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_);
    if (ptx == nullptr) {
      ptx = spec.cuda_ptx_in_memory().default_text();
    }
    if (ptx == nullptr) {
      LOG(FATAL) << "Loader spec has no ptx for kernel " << *kernelname;
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromPtx(ptx, &module));
    kernel_to_gpu_binary_[kernel] = ptx;
  } else {
    return port::InternalError("No method of loading CUDA kernel provided");
  }
  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    cuda_kernel->gpu_function_ptr())) {
    return port::InternalError("Could not find the corresponding function");
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the CUDA API.
  cuda_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  TF_RETURN_IF_ERROR(GetKernelMetadata(cuda_kernel, &kernel_metadata));
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return port::Status::OK();
}

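// Decrements the refcount on the module that backs `gpu_binary`; once the
// refcount reaches zero, the module is unloaded from the driver. Returns false
// if no module is loaded for the given binary.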
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
    return false;
  }
  auto& module = module_it->second.first;
  auto& refcount = module_it->second.second;
  VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading CUDA module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
  }
  return true;
}

void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  absl::MutexLock lock{&in_memory_modules_mu_};
  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
            << " has never been loaded.";
    return;  // We've never seen this kernel.
  }
  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
          << " has loaded GPU code " << gpu_binary_it->second;
  UnloadGpuBinary(gpu_binary_it->second);
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}

port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
                                     ModuleHandle* module_handle) {
  // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
  // ModuleHandle::id().
  CUmodule cu_module;
  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(
        reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
        &cu_module));
    *module_handle = ModuleHandle(const_cast<void*>(
        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
    return port::Status::OK();
  } else if (spec.has_cuda_ptx_in_memory()) {
    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    if (!spec.cuda_ptx_in_memory()) {
      return port::InternalError("PTX not found in spec");
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(
        LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module));
    *module_handle = ModuleHandle(
        const_cast<void*>(static_cast<const void*>(spec.cuda_ptx_in_memory())));
    return port::Status::OK();
  }
  return port::InternalError("No method of loading CUDA module provided");
}

bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
  absl::MutexLock lock{&in_memory_modules_mu_};
  return UnloadGpuBinary(gpu_binary);
}

port::Status GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
                                            KernelMetadata* kernel_metadata) {
  int value;
  TF_RETURN_IF_ERROR(GpuDriver::FuncGetAttribute(
      CU_FUNC_ATTRIBUTE_NUM_REGS, *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_registers_per_thread(value);

  TF_RETURN_IF_ERROR(
      GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
                                  *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_shared_memory_bytes(value);
  return port::Status::OK();
}

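// Enqueues `kernel` onto `stream` with the given block/thread geometry: logs
// occupancy once per kernel (only at VLOG level 2+), applies the kernel's
// preferred cache configuration if any, then hands the launch to the driver.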
port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  CUstream custream = AsGpuStreamValue(stream);
  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once.  Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    absl::MutexLock lock(&launched_kernels_mu_);
    if (!launched_kernels_.count(cufunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(cufunc);
    }
  }

  if (cuda_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
        cufunc, cuda_kernel->GetGpuCacheConfig()));
  }

  void** kernel_params = const_cast<void**>(args.argument_addresses().data());

  return GpuDriver::LaunchKernel(
      context_, cufunc, block_dims.x, block_dims.y, block_dims.z, thread_dims.x,
      thread_dims.y, thread_dims.z, args.number_of_shared_bytes(), custream,
      kernel_params, nullptr /* = extra */);
}

// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  VLOG(2) << "Computing kernel occupancy for kernel "
          << kernel.demangled_name();
  VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
          << ", " << thread_dims.z << ")";

  int regs_per_thread;
  if (!kernel.metadata().registers_per_thread(&regs_per_thread)) {
    return;
  }

  int smem_per_block;
  if (!kernel.metadata().shared_memory_bytes(&smem_per_block)) {
    return;
  }

  const DeviceDescription& device_description =
      kernel.parent()->GetDeviceDescription();

  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
                                         smem_per_block, thread_dims, cufunc);
  VLOG(2) << "Resident blocks per SM is " << blocks_per_sm;

  int suggested_threads =
      CompareOccupancy(&blocks_per_sm, device_description, regs_per_thread,
                       smem_per_block, thread_dims, cufunc);
  if (suggested_threads != 0) {
    VLOG(2) << "The cuda occupancy calculator recommends using "
            << suggested_threads
            << " threads per block to achieve an occupancy of " << blocks_per_sm
            << " blocks per SM.";
  }
}

// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads per
// block.  If unable to compute occupancy, zero is returned.
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                                    uint64 registers_per_thread,
                                    uint64 shared_memory_per_block,
                                    const ThreadDim& thread_dims,
                                    CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  return suggested_blocks;
}

// Compute and return the suggested thread count to achieve ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64 registers_per_thread,
                                  uint64 shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  if (suggested_blocks > *initial_blocks) {
    *initial_blocks = suggested_blocks;
    return suggested_threads;
  } else {
    return 0;
  }
}

DeviceMemoryBase GpuExecutor::Allocate(uint64 size, int64 memory_space) {
  CHECK_EQ(memory_space, 0);
  return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
}

void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                                uint64 size_bytes) {
  // offset and size are in bytes, so char* works as the pointer type.
  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
}

void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  GpuDriver::DeviceDeallocate(context_, mem->opaque());
}

bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}

bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}

bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}

port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
                                             uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), 0x0, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           0x0, size);
}

port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
                                            int value, uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    // cudaMemset reinterprets "value" as a uint8.
    uint8 byte_value = static_cast<uint8>(value);
    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                     (byte_value << 8) | byte_value;
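    // The byte is replicated into each lane of the word; e.g. value 0xAB
    // yields the 32-bit pattern 0xABABABAB.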
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), pattern, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           value, size);
}

port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                            const void* host_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                         host_src, size);
}

port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
                                            const DeviceMemoryBase& gpu_src,
                                            uint64 size) {
  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
                                  uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return Memset32(stream, location, 0x0, size);
  } else {
    return Memset(stream, location, 0x0, size);
  }
}

port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
                                 uint8 pattern, uint64 size) {
  // Cast so the uint8 logs as a hex number rather than a raw character.
  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << static_cast<uint32>(pattern);
  return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                            pattern, size,
                                            AsGpuStreamValue(stream));
}

port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                                   uint32 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsCudaDevicePtr(location), pattern, size / 4,
      AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
                         const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
                         const void* host_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                          host_src, size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
                                       DeviceMemoryBase* gpu_dst,
                                       const DeviceMemoryBase& gpu_src,
                                       uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::HostCallback(Stream* stream,
                               std::function<port::Status()> callback) {
  auto callback_ptr = new std::function<void()>([callback]() {
    port::Status s = callback();
    if (!s.ok()) {
      LOG(WARNING) << "Host callback failed: " << s;
    }
  });
  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
                                      InternalHostCallback, callback_ptr);
}

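// Trampoline handed to the driver: reconstitutes the heap-allocated
// std::function created in HostCallback above, invokes it, then deletes it,
// so the wrapper's lifetime ends with its single invocation.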
/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
                                                    CUresult status,
                                                    void* data) {
  std::function<void()>* callback =
      reinterpret_cast<std::function<void()>*>(data);
  (*callback)();
  delete callback;
}

port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}

port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}

port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}

port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
                                   AsGpuEvent(event)->gpu_event())) {
    return port::Status::OK();
  } else {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("error waiting for CUDA event on stream %p", stream));
  }
}

Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}

bool GpuExecutor::AllocateStream(Stream* stream) {
  return AsGpuStream(stream)->Init();
}

void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* cuda_stream = AsGpuStream(stream);
  if (!cuda_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  cuda_stream->Destroy();
}

bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}

void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}

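// Makes `dependent` wait, on-device, for all work enqueued on `other` so far:
// records `other`'s completion event and queues a wait on that event in
// `dependent`, with no host-side synchronization involved.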
bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  CUevent other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}

bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}

bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}

port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* GpuExecutor::CreateBlas() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::BlasFactory> status =
      registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.blas());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve BLAS factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

dnn::DnnSupport* GpuExecutor::CreateDnn() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::DnnFactory> status =
      registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.dnn());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve DNN factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

fft::FftSupport* GpuExecutor::CreateFft() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::FftFactory> status =
      registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.fft());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve FFT factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

rng::RngSupport* GpuExecutor::CreateRng() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::RngFactory> status =
      registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.rng());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve RNG factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }

bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
}

port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
}

bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}

bool GpuExecutor::GetSymbol(const std::string& symbol_name,
                            ModuleHandle module_handle, void** mem,
                            size_t* bytes) {
  auto lookup_in_module = [&](CUmodule module) {
    CHECK(module != nullptr);
    return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
                                      reinterpret_cast<CUdeviceptr*>(mem),
                                      bytes);
  };

  {  // give limited scope to mutex_lock
    absl::MutexLock lock{&in_memory_modules_mu_};
    if (static_cast<bool>(module_handle)) {
      auto it = gpu_binary_to_module_.find(module_handle.id());
      CHECK(it != gpu_binary_to_module_.end());
      return lookup_in_module(it->second.first);
    }

    for (auto& it : gpu_binary_to_module_) {
      if (lookup_in_module(it.second.first)) {
        return true;
      }
    }
  }

  LOG(INFO) << "Failed to find symbol in any modules: " << symbol_name;
  return false;
}

bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
    return false;
  }

  block_dim_limit->x = x;
  block_dim_limit->y = y;
  block_dim_limit->z = z;
  return true;
}

bool GpuExecutor::SupportsBlas() const { return true; }

bool GpuExecutor::SupportsFft() const { return true; }

bool GpuExecutor::SupportsRng() const { return true; }

std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}

std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}

std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}

std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}

void* GpuExecutor::GpuContextHack() { return context_; }

GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const std::string& pci_bus_id,
                             int device_ordinal) {
#if defined(__APPLE__)
  LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
  return 0;
#elif defined(PLATFORM_WINDOWS)
  // Windows support for NUMA is not currently implemented. Return node 0.
  return 0;
#elif defined(__aarch64__)
  LOG(INFO) << "ARM64 does not support NUMA - returning NUMA node zero";
  return 0;
#else
  VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
  static const int kUnknownNumaNode = -1;

  if (pci_bus_id.empty()) {
    LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
    return kUnknownNumaNode;
  }

  std::string filename =
      absl::StrFormat("/sys/bus/pci/devices/%s/numa_node", pci_bus_id);
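  // e.g. "/sys/bus/pci/devices/0000:04:00.0/numa_node" (bus ID illustrative).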

  // We have to use fopen/fread here so that the device properties can be
  // populated before InitGoogle procedure has been completed (at which point we
  // could use the file::* utilities).
  FILE* file = fopen(filename.c_str(), "r");
  if (file == nullptr) {
    LOG(INFO) << "could not open file to read NUMA node: " << filename
              << "\nYour kernel may have been built without NUMA support.";
    return kUnknownNumaNode;
  }

  std::string content;
  char buf[32];
  size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
  buf[did_read] = '\0';
  content = buf;

  int32 value;
  if (port::safe_strto32(content, &value)) {
    if (value < 0) {  // See http://b/18228951 for details on this path.
      LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
                << value
                << "), but there must be at least one NUMA node"
                   ", so returning NUMA node zero";
      fclose(file);
      return 0;
    }
    fclose(file);
    return value;
  }

  LOG(WARNING)
      << "could not convert SysFS file contents to integral NUMA node value: "
      << content;

  fclose(file);
  return kUnknownNumaNode;
#endif
}

port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  int cc_major;
  int cc_minor;
  status = GpuDriver::GetComputeCapability(&cc_major, &cc_minor, device);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    std::string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion()));
    builder.set_driver_version(augmented_driver_version);
  }

  {
    std::string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lower the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  {
    builder.set_threads_per_block_limit(
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                      device)
            .ValueOrDie());

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device)
                             .ValueOrDie();
    thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device)
                             .ValueOrDie();
    thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device)
                             .ValueOrDie();
    builder.set_thread_dim_limit(thread_dim_limit);

    int clock_rate =
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)
            .ValueOrDie();
    builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
  }

  {
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    uint64 device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal);
  port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal);
  if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
    // Times 2 because HBM is DDR memory; it gets two data bits per each data
    // lane.
    builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} *
                                 1000 *
                                 int64_t{mem_bus_width_bits.ValueOrDie()} / 8);
  }
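  // Worked example with hypothetical numbers: an 877,000 kHz memory clock on
  // a 4096-bit bus gives 2 * 877e6 Hz * (4096 / 8) bytes ~= 898 GB/s.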

  {
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    std::string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("Compute Capability ", cc_major, ".", cc_minor));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("NVIDIA Corporation");
  builder.set_cuda_compute_capability(cc_major, cc_minor);
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  builder.set_registers_per_core_limit(
      GpuDriver::GetDeviceAttribute(
          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device)
          .ValueOrDie());

  return builder.Build();
}

}  // namespace gpu

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {});