/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"

#if defined(__APPLE__)
#include <mach-o/dyld.h>
#endif
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <unistd.h>
#endif
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_event.h"
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/cuda/cuda_timer.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/mathutil.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/path.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/timer.h"

// LOG(ERROR) uses a const named ERROR, so a macro with the same name is
// always unwanted. This happens on Windows, which defines such a macro.
#undef ERROR

#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif

#ifdef __CUDA_RUNTIME_H__
#error \
    "CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif

extern bool FLAGS_check_gpu_leaks;
bool FLAGS_prefer_cubin_to_ptx = true;

namespace stream_executor {
namespace gpu {

// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
// It has been observed that loading both PTX and cubins into the driver library
// can cause it to crash, but loading only CUBINs avoids those crashes;
// therefore, it's useful to have this hook to hack in uniform CUBIN-ation of
// PTX code.
//
// As this is an implementation-detail workaround, the usage is to declare this
// variable with extern linkage and populate it from another translation unit.
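//
// A minimal sketch of doing so (hypothetical; `CompilePtxToCubin` stands in
// for whatever PTX->CUBIN compiler the populating translation unit links in):
//
//   extern std::function<std::string(const std::string&)> g_cubinate;
//
//   static const bool g_cubinate_installed = [] {
//     g_cubinate = [](const std::string& ptx) {
//       return CompilePtxToCubin(ptx);  // hypothetical helper
//     };
//     return true;
//   }();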
std::function<std::string(const std::string&)> g_cubinate;

static GpuEvent* AsGpuEvent(Event* event) {
  DCHECK(event != nullptr);
  return static_cast<GpuEvent*>(event->implementation());
}

// Given a platform-independent timer datatype, returns the internal CUDA
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
  DCHECK(timer != nullptr);
  return static_cast<GpuTimer*>(timer->implementation());
}

// Given const GPU memory, returns a libcuda device pointer datatype, suitable
// for passing directly to libcuda APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// libcuda APIs, so the caller should take care to only pass the result of const
// GPU memory conversions to libcuda functions which will honor constness.
static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase& gpu_mem) {
  return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque());
}

// See description on const version above.
static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase* gpu_mem) {
  return AsCudaDevicePtr(*gpu_mem);
}

GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
  CHECK(cuda_exec != nullptr);
  return cuda_exec->gpu_context();
}

GpuExecutor::~GpuExecutor() {
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
}

port::Status GpuExecutor::Init(int device_ordinal,
                               DeviceOptions device_options) {
  device_ordinal_ = device_ordinal;

  auto status = GpuDriver::Init();
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::GetDevice(device_ordinal_, &device_);
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
                                    &context_);
  if (!status.ok()) {
    return status;
  }

  return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
}

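// Probes for a compute-capability-specialized file before the generic one:
// for example, filename "foo" with canonical_suffix ".cubin" on a CC 7.0
// device checks "foo.cc70.cubin" first, then falls back to "foo" itself.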
bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    std::string* found_filename) const {
  if (cc_major_ == 0 && cc_minor_ == 0) {
    return false;
  }

  std::string cc_specific =
      absl::StrCat(filename, ".cc", cc_major_, cc_minor_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found compute-capability-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find compute-capability specific file at: "
          << cc_specific;
  if (port::FileExists(std::string(filename)).ok()) {
    *found_filename = std::string(filename);
    return true;
  }

  return false;
}

bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          std::string* found_filename) const {
  LOG(ERROR)
      << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
  return false;
}

// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
//                 returned string. Example: calling this from /usr/bin/foo
//                 would return /usr/bin.
static std::string GetBinaryDir(bool strip_exe) {
  char exe_path[PATH_MAX] = {0};
#if defined(__APPLE__)
  uint32_t buffer_size = 0U;
  _NSGetExecutablePath(nullptr, &buffer_size);
  char unresolved_path[buffer_size];
  _NSGetExecutablePath(unresolved_path, &buffer_size);
  CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
#else
#if defined(PLATFORM_WINDOWS)
  HMODULE hModule = GetModuleHandle(NULL);
  GetModuleFileName(hModule, exe_path, MAX_PATH);
#else
  PCHECK(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1) != -1);
#endif
#endif
  // Make sure it's null-terminated:
  exe_path[sizeof(exe_path) - 1] = 0;

  if (strip_exe) {
    // The exe is the last component of the path, so remove one component.
    std::vector<std::string> components = absl::StrSplit(exe_path, '/');
    components.pop_back();
    return absl::StrJoin(components, "/");
  }
  return exe_path;
}

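// Loads (or re-uses) the CUBIN pointed to by `cubin` and bumps its refcount
// in gpu_binary_to_module_; callers hold in_memory_modules_mu_.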
port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
                                              CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadCubin(context_, cubin, module));
    module_refcount = 1;
    VLOG(3) << "Loaded CUBIN " << static_cast<const void*>(cubin)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "CUBIN " << static_cast<const void*>(cubin)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[cubin] = {*module, module_refcount};
  return port::Status::OK();
}

port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadPtx(context_, ptx, module));
    VLOG(3) << "Loaded PTX " << static_cast<const void*>(ptx) << " as module "
            << *module;
    module_refcount = 1;
  } else {
    ++module_refcount;
    VLOG(3) << "PTX " << static_cast<const void*>(ptx)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[ptx] = {*module, module_refcount};
  return port::Status::OK();
}

port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
                                              CUmodule* module) {
  return port::InternalError(
      "Feature not supported on CUDA platform (LoadModuleFromHsaco)");
}

port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
  GpuKernel* cuda_kernel = AsGpuKernel(kernel);
  CUmodule module;
  const std::string* kernelname;

  VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();

  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    kernelname = &spec.cuda_cubin_in_memory().kernelname();
    const char* cubin = spec.cuda_cubin_in_memory().bytes();
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(cubin, &module));
    kernel_to_gpu_binary_[kernel] = cubin;
  } else if (spec.has_cuda_ptx_in_memory()) {
    kernelname = &spec.cuda_ptx_in_memory().kernelname();

    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    const char* ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_);
    if (ptx == nullptr) {
      ptx = spec.cuda_ptx_in_memory().default_text();
    }
    if (ptx == nullptr) {
      LOG(FATAL) << "Loader spec has no ptx for kernel " << *kernelname;
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromPtx(ptx, &module));
    kernel_to_gpu_binary_[kernel] = ptx;
  } else {
    return port::InternalError("No method of loading CUDA kernel provided");
  }
  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    cuda_kernel->gpu_function_ptr())) {
    return port::InternalError("Could not find the corresponding function");
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the CUDA API.
  cuda_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  TF_RETURN_IF_ERROR(GetKernelMetadata(cuda_kernel, &kernel_metadata));
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return port::Status::OK();
}

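// Drops one reference on the module backing `gpu_binary`, unloading the
// module from the driver once its refcount reaches zero; callers hold
// in_memory_modules_mu_.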
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
    return false;
  }
  auto& module = module_it->second.first;
  auto& refcount = module_it->second.second;
  VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading CUDA module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
  }
  return true;
}

void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  absl::MutexLock lock{&in_memory_modules_mu_};
  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
            << " has never been loaded.";
    return;  // We've never seen this kernel.
  }
  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
          << " has loaded GPU code " << gpu_binary_it->second;
  UnloadGpuBinary(gpu_binary_it->second);
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}

port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
                                     ModuleHandle* module_handle) {
  // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
  // ModuleHandle::id().
  CUmodule cu_module;
  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(
        reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
        &cu_module));
    *module_handle = ModuleHandle(const_cast<void*>(
        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
    return port::Status::OK();
  } else if (spec.has_cuda_ptx_in_memory()) {
    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    if (!spec.cuda_ptx_in_memory()) {
      return port::InternalError("PTX not found in spec");
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(
        LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module));
    *module_handle = ModuleHandle(
        const_cast<void*>(static_cast<const void*>(spec.cuda_ptx_in_memory())));
    return port::Status::OK();
  }
  return port::InternalError("No method of loading CUDA module provided");
}

bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
  absl::MutexLock lock{&in_memory_modules_mu_};
  return UnloadGpuBinary(gpu_binary);
}

port::Status GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
                                            KernelMetadata* kernel_metadata) {
  int value;
  TF_RETURN_IF_ERROR(GpuDriver::FuncGetAttribute(
      CU_FUNC_ATTRIBUTE_NUM_REGS, *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_registers_per_thread(value);

  TF_RETURN_IF_ERROR(
      GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
                                  *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_shared_memory_bytes(value);
  return port::Status::OK();
}

port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  CUstream custream = AsGpuStreamValue(stream);
  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once.  Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    absl::MutexLock lock(&launched_kernels_mu_);
    if (!launched_kernels_.count(cufunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(cufunc);
    }
  }

  if (cuda_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
        cufunc, cuda_kernel->GetGpuCacheConfig()));
  }

  void** kernel_params = const_cast<void**>(args.argument_addresses().data());

  return GpuDriver::LaunchKernel(context_, kernel.name(), cufunc, block_dims.x,
                                 block_dims.y, block_dims.z, thread_dims.x,
                                 thread_dims.y, thread_dims.z,
                                 args.number_of_shared_bytes(), custream,
                                 kernel_params, nullptr /* = extra */);
}

// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  VLOG(2) << "Computing kernel occupancy for kernel "
          << kernel.demangled_name();
  VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
          << ", " << thread_dims.z << ")";

  int regs_per_thread;
  if (!kernel.metadata().registers_per_thread(&regs_per_thread)) {
    return;
  }

  int smem_per_block;
  if (!kernel.metadata().shared_memory_bytes(&smem_per_block)) {
    return;
  }

  const DeviceDescription& device_description =
      kernel.parent()->GetDeviceDescription();

  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
                                         smem_per_block, thread_dims, cufunc);
  VLOG(2) << "Resident blocks per SM is " << blocks_per_sm;

  int suggested_threads =
      CompareOccupancy(&blocks_per_sm, device_description, regs_per_thread,
                       smem_per_block, thread_dims, cufunc);
  if (suggested_threads != 0) {
    VLOG(2) << "The cuda occupancy calculator recommends using "
            << suggested_threads
            << " threads per block to achieve an occupancy of " << blocks_per_sm
            << " blocks per SM.";
  }
}

// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads per
// block.  If unable to compute occupancy, zero is returned.
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                                    uint64 registers_per_thread,
                                    uint64 shared_memory_per_block,
                                    const ThreadDim& thread_dims,
                                    CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  return suggested_blocks;
}

// Compute and return the suggested thread count to achieve ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64 registers_per_thread,
                                  uint64 shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  if (suggested_blocks > *initial_blocks) {
    *initial_blocks = suggested_blocks;
    return suggested_threads;
  } else {
    return 0;
  }
}

DeviceMemoryBase GpuExecutor::Allocate(uint64 size, int64_t memory_space) {
  CHECK_EQ(memory_space, 0);
  return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
}

void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                                uint64 size_bytes) {
  // offset and size are in bytes, so char* works as the pointer type.
  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
}

void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  GpuDriver::DeviceDeallocate(context_, mem->opaque());
}

bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}

bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}

bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}

port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
                                             uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), 0x0, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           0x0, size);
}

port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
                                            int value, uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    // cudaMemset reinterprets "value" as a uint8.
    uint8 byte_value = static_cast<uint8>(value);
    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                     (byte_value << 8) | byte_value;
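    // For example, value == 0xAB yields the 32-bit pattern 0xABABABAB.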
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), pattern, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           value, size);
}

port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                            const void* host_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                         host_src, size);
}

port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
                                            const DeviceMemoryBase& gpu_src,
                                            uint64 size) {
  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
                                  uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return Memset32(stream, location, 0x0, size);
  } else {
    return Memset(stream, location, 0x0, size);
  }
}

port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
                                 uint8 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                            pattern, size,
                                            AsGpuStreamValue(stream));
}

port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                                   uint32 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsCudaDevicePtr(location), pattern, size / 4,
      AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
                         const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
                         const void* host_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                          host_src, size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
                                       DeviceMemoryBase* gpu_dst,
                                       const DeviceMemoryBase& gpu_src,
                                       uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

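// Enqueues `callback` on `stream`. The callback is copied onto the heap and
// reclaimed by InternalHostCallback once the driver has invoked it.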
bool GpuExecutor::HostCallback(Stream* stream,
                               std::function<port::Status()> callback) {
  auto callback_ptr = new std::function<void()>([callback]() {
    port::Status s = callback();
    if (!s.ok()) {
      LOG(WARNING) << "Host callback failed: " << s;
    }
  });
  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
                                      InternalHostCallback, callback_ptr);
}

/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
                                                    CUresult status,
                                                    void* data) {
  std::function<void()>* callback =
      reinterpret_cast<std::function<void()>*>(data);
  (*callback)();
  delete callback;
}

port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}

port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}

port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}

port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
                                   AsGpuEvent(event)->gpu_event())) {
    return port::Status::OK();
  } else {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("error waiting for CUDA event on stream %p",
                        stream));
  }
}

Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}

bool GpuExecutor::AllocateStream(Stream* stream) {
  return AsGpuStream(stream)->Init();
}

void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* cuda_stream = AsGpuStream(stream);
  if (!cuda_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  cuda_stream->Destroy();
}

bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}

void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}

bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  CUevent other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}

bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}

bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}

port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* GpuExecutor::CreateBlas() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::BlasFactory> status =
      registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.blas());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve BLAS factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

dnn::DnnSupport* GpuExecutor::CreateDnn() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::DnnFactory> status =
      registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.dnn());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve DNN factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

fft::FftSupport* GpuExecutor::CreateFft() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::FftFactory> status =
      registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.fft());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve FFT factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

rng::RngSupport* GpuExecutor::CreateRng() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::RngFactory> status =
      registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.rng());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve RNG factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }

bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
}

port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
}

bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}

bool GpuExecutor::GetSymbol(const std::string& symbol_name,
                            ModuleHandle module_handle, void** mem,
                            size_t* bytes) {
  auto lookup_in_module = [&](CUmodule module) {
    CHECK(module != nullptr);
    return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
                                      reinterpret_cast<CUdeviceptr*>(mem),
                                      bytes);
  };

  {  // give limited scope to mutex_lock
    absl::MutexLock lock{&in_memory_modules_mu_};
    if (static_cast<bool>(module_handle)) {
      auto it = gpu_binary_to_module_.find(module_handle.id());
      CHECK(it != gpu_binary_to_module_.end());
      return lookup_in_module(it->second.first);
    }

    for (auto& it : gpu_binary_to_module_) {
      if (lookup_in_module(it.second.first)) {
        return true;
      }
    }
  }

  LOG(INFO) << "Failed to find symbol in any modules: " << symbol_name;
  return false;
}

bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
    return false;
  }

  block_dim_limit->x = x;
  block_dim_limit->y = y;
  block_dim_limit->z = z;
  return true;
}

bool GpuExecutor::SupportsBlas() const { return true; }

bool GpuExecutor::SupportsFft() const { return true; }

bool GpuExecutor::SupportsRng() const { return true; }

std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}

std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}

std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}

std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}

void* GpuExecutor::GpuContextHack() { return context_; }

GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const std::string& pci_bus_id,
                             int device_ordinal) {
#if defined(__APPLE__)
  LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
  return 0;
#elif defined(PLATFORM_WINDOWS)
  // Windows support for NUMA is not currently implemented. Return node 0.
  return 0;
#elif defined(__aarch64__)
  LOG(INFO) << "ARM64 does not support NUMA - returning NUMA node zero";
  return 0;
#else
  VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
  static const int kUnknownNumaNode = -1;

  if (pci_bus_id.empty()) {
    LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
    return kUnknownNumaNode;
  }

  std::string filename =
      absl::StrFormat("/sys/bus/pci/devices/%s/numa_node", pci_bus_id);

  // We have to use fopen/fread here so that the device properties can be
  // populated before InitGoogle procedure has been completed (at which point we
  // could use the file::* utilities).
  FILE* file = fopen(filename.c_str(), "r");
  if (file == nullptr) {
    LOG(INFO) << "could not open file to read NUMA node: " << filename
              << "\nYour kernel may have been built without NUMA support.";
    return kUnknownNumaNode;
  }

  std::string content;
  char buf[32];
  size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
  buf[did_read] = '\0';
  content = buf;

  int32_t value;
  if (port::safe_strto32(content, &value)) {
    if (value < 0) {  // See http://b/18228951 for details on this path.
      LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
                << value
                << "), but there must be at least one NUMA node"
                   ", so returning NUMA node zero";
      fclose(file);
      return 0;
    }
    fclose(file);
    return value;
  }

  LOG(WARNING)
      << "could not convert SysFS file contents to integral NUMA node value: "
      << content;

  fclose(file);
  return kUnknownNumaNode;
#endif
}

port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  int cc_major;
  int cc_minor;
  status = GpuDriver::GetComputeCapability(&cc_major, &cc_minor, device);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    std::string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion()));
    builder.set_driver_version(augmented_driver_version);
  }

  {
    std::string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lower the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  {
    builder.set_threads_per_block_limit(
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                      device)
            .ValueOrDie());

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device)
                             .ValueOrDie();
    thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device)
                             .ValueOrDie();
    thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device)
                             .ValueOrDie();
    builder.set_thread_dim_limit(thread_dim_limit);

    int clock_rate =
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)
            .ValueOrDie();
    builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
  }

  {
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    uint64 device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal);
  port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal);
  if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
    // Times 2 because HBM is DDR memory; it gets two data bits per each data
    // lane.
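    // For example (hypothetical numbers): an 877,000 kHz memory clock on a
    // 4096-bit bus gives 2 * 877e6 * 4096 / 8 bytes/s, roughly 898 GB/s.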
    builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} *
                                 1000 *
                                 int64_t{mem_bus_width_bits.ValueOrDie()} / 8);
  }

  {
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    std::string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("Compute Capability ", cc_major, ".", cc_minor));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("NVIDIA Corporation");
  builder.set_cuda_compute_capability(cc_major, cc_minor);
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  builder.set_registers_per_core_limit(
      GpuDriver::GetDeviceAttribute(
          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device)
          .ValueOrDie());

  return builder.Build();
}

}  // namespace gpu

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {});