• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include <unistd.h>
17 
18 #include "absl/base/casts.h"
19 #include "absl/strings/ascii.h"
20 #include "absl/strings/str_cat.h"
21 #include "absl/strings/str_format.h"
22 #include "absl/strings/str_join.h"
23 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
24 #include "tensorflow/stream_executor/gpu/gpu_event.h"
25 #include "tensorflow/stream_executor/gpu/gpu_executor.h"
26 #include "tensorflow/stream_executor/gpu/gpu_stream.h"
27 #include "tensorflow/stream_executor/gpu/gpu_timer.h"
28 #include "tensorflow/stream_executor/kernel_cache_config.h"
29 #include "tensorflow/stream_executor/lib/env.h"
30 #include "tensorflow/stream_executor/lib/error.h"
31 #include "tensorflow/stream_executor/lib/initialize.h"
32 #include "tensorflow/stream_executor/lib/mathutil.h"
33 #include "tensorflow/stream_executor/lib/numbers.h"
34 #include "tensorflow/stream_executor/lib/path.h"
35 #include "tensorflow/stream_executor/lib/process_state.h"
36 #include "tensorflow/stream_executor/lib/statusor.h"
37 #include "tensorflow/stream_executor/platform.h"
38 #include "tensorflow/stream_executor/platform/dso_loader.h"
39 #include "tensorflow/stream_executor/platform/logging.h"
40 #include "tensorflow/stream_executor/platform/port.h"
41 #include "tensorflow/stream_executor/plugin_registry.h"
42 #include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
43 #include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
44 #include "tensorflow/stream_executor/stream.h"
45 #include "tensorflow/stream_executor/stream_executor_internal.h"
46 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
47 #include "tensorflow/stream_executor/timer.h"
48 
49 #ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
50 #error \
51     "No driver calls in this file, wrap driver functionality in rocm_driver.cc."
52 #endif
53 
54 #ifdef __ROCM_RUNTIME_H__
55 #error \
56     "ROCM runtime being included into ROCM GPU executor; should be driver only."
57 #endif
58 
59 namespace stream_executor {
60 namespace gpu {
61 
AsGpuEvent(Event * event)62 static GpuEvent* AsGpuEvent(Event* event) {
63   DCHECK(event != nullptr);
64   return static_cast<GpuEvent*>(event->implementation());
65 }
66 
67 // Given a platform-independent timer datatype, returns the internal ROCM
68 // platform implementation pointer.
AsGpuTimer(Timer * timer)69 static GpuTimer* AsGpuTimer(Timer* timer) {
70   DCHECK(timer != nullptr);
71   return static_cast<GpuTimer*>(timer->implementation());
72 }
73 
74 // Given const GPU memory, returns a librocm device pointer datatype, suitable
75 // for passing directly to librocm APIs.
76 //
77 // N.B. we must lose constness in order to pass a suitable type to the existing
78 // librocm APIs, so the caller should take care to only pass the result of const
79 // GPU memory conversions to librocm functions which will honor constness.
AsROCmDevicePtr(const DeviceMemoryBase & gpu_mem)80 static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
81   return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
82 }
83 
84 // See description on const version above.
AsROCmDevicePtr(DeviceMemoryBase * gpu_mem)85 static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
86   return AsROCmDevicePtr(*gpu_mem);
87 }
88 
GetGpuContext(Stream * stream)89 static GpuContext* GetGpuContext(Stream* stream) {
90   return static_cast<GpuExecutor*>(stream->parent()->implementation())
91       ->gpu_context();
92 }
93 
ExtractGpuContext(GpuExecutor * rocm_exec)94 GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
95   CHECK(rocm_exec != nullptr);
96   return rocm_exec->gpu_context();
97 }
98 
ExtractGpuExecutor(StreamExecutor * stream_exec)99 GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
100   return static_cast<GpuExecutor*>(stream_exec->implementation());
101 }
102 
~GpuExecutor()103 GpuExecutor::~GpuExecutor() {
104   for (auto& it : disk_modules_) {
105     GpuDriver::UnloadModule(context_, it.second);
106   }
107   for (auto& it : in_memory_modules_) {
108     GpuDriver::UnloadModule(context_, it.second);
109   }
110   if (context_ != nullptr) {
111     GpuDriver::DestroyContext(context_);
112   }
113   CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
114   CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
115 }
UnloadModule(ModuleHandle module_handle)116 bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
117   const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
118   absl::MutexLock lock{&in_memory_modules_mu_};
119   return UnloadGpuBinary(gpu_binary);
120 }
121 
UnloadGpuBinary(const void * gpu_binary)122 bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
123   auto module_it = gpu_binary_to_module_.find(gpu_binary);
124   if (gpu_binary_to_module_.end() == module_it) {
125     VLOG(3) << "No loaded  HSACO module for " << gpu_binary;
126     return false;
127   }
128   auto& module = module_it->second.first;
129   auto& refcount = module_it->second.second;
130   VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
131   if (--refcount == 0) {
132     VLOG(3) << "Unloading  HSACO module " << module;
133     GpuDriver::UnloadModule(context_, module);
134     gpu_binary_to_module_.erase(module_it);
135     const char* mem_it = nullptr;
136     for (auto x : in_memory_modules_) {
137       if (x.second == module) mem_it = x.first;
138     }
139     if (mem_it != nullptr) in_memory_modules_.erase(mem_it);
140   }
141   return true;
142 }
143 
UnloadKernel(const KernelBase * kernel)144 void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
145   VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
146 
147   absl::MutexLock lock{&in_memory_modules_mu_};
148   auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
149   if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
150     VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
151             << " has never been loaded.";
152     return;  // We've never seen this kernel.
153   }
154   VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
155           << " has loaded GPU code " << gpu_binary_it->second;
156   UnloadGpuBinary(gpu_binary_it->second);
157   kernel_to_gpu_binary_.erase(gpu_binary_it);
158 }
159 
Init(int device_ordinal,DeviceOptions device_options)160 port::Status GpuExecutor::Init(int device_ordinal,
161                                DeviceOptions device_options) {
162   device_ordinal_ = device_ordinal;
163 
164   auto status = GpuDriver::Init();
165   if (!status.ok()) {
166     return status;
167   }
168 
169   status = GpuDriver::GetDevice(device_ordinal_, &device_);
170   if (!status.ok()) {
171     return status;
172   }
173 
174   status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
175                                     &context_);
176   if (!status.ok()) {
177     return status;
178   }
179 
180   return GpuDriver::GetGpuISAVersion(&version_, device_);
181 }
182 
FindOnDiskForComputeCapability(absl::string_view filename,absl::string_view canonical_suffix,string * found_filename) const183 bool GpuExecutor::FindOnDiskForComputeCapability(
184     absl::string_view filename, absl::string_view canonical_suffix,
185     string* found_filename) const {
186   LOG(FATAL) << "Feature not supported on ROCM platform "
187                 "(FindOnDiskForComputeCapability)";
188   return false;
189 }
190 
FindOnDiskForISAVersion(absl::string_view filename,absl::string_view canonical_suffix,string * found_filename) const191 bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
192                                           absl::string_view canonical_suffix,
193                                           string* found_filename) const {
194   if (version_ == 0) {
195     return false;
196   }
197 
198   string cc_specific =
199       absl::StrCat(filename, ".cc", version_, canonical_suffix);
200   if (port::FileExists(cc_specific).ok()) {
201     VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
202             << cc_specific;
203     *found_filename = cc_specific;
204     return true;
205   }
206 
207   VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
208           << cc_specific;
209   if (port::FileExists(string(filename)).ok()) {
210     *found_filename = string(filename);
211     return true;
212   }
213 
214   return false;
215 }
216 
217 // Returns the path to the running executable.
218 // N.B. Derived from //knowledge/smalltalk/background_kb.cc
219 // Arg: strip_exe: if true, remove the name of the executable itself from the
220 //                 returned string. Example: calling this from /usr/bin/foo
221 //                 would return /usr/bin.
GetBinaryDir(bool strip_exe)222 static string GetBinaryDir(bool strip_exe) {
223   char exe_path[PATH_MAX] = {0};
224   PCHECK(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1) != -1);
225   // Make sure it's null-terminated:
226   exe_path[sizeof(exe_path) - 1] = 0;
227 
228   if (strip_exe) {
229     // The exe is the last component of the path, so remove one component.
230     string ret = exe_path;
231     std::vector<string> components = absl::StrSplit(exe_path, '/');
232     components.pop_back();
233     return absl::StrJoin(components, "/");
234   }
235   return exe_path;
236 }
237 
GetKernel(const MultiKernelLoaderSpec & spec,KernelBase * kernel)238 port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
239                                     KernelBase* kernel) {
240   GpuKernel* rocm_kernel = AsGpuKernel(kernel);
241   hipModule_t module = nullptr;
242   const string* kernelname;
243 
244   const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
245   bool has_cubin = spec.has_cuda_cubin_on_disk();
246   if (has_cubin) {
247     on_disk_spec = &spec.cuda_cubin_on_disk();
248   }
249 
250   if (on_disk_spec != nullptr) {
251     return port::InternalError(
252         "Loading ROCM kernel from disk is not supported");
253   } else if (spec.has_cuda_cubin_in_memory()) {
254     kernelname = &spec.cuda_cubin_in_memory().kernelname();
255 
256     const char* hsaco = spec.cuda_cubin_in_memory().bytes();
257     absl::MutexLock lock{&in_memory_modules_mu_};
258     module = in_memory_modules_[hsaco];
259 
260     if (module == nullptr) {
261       TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, &module));
262     }
263     kernel_to_gpu_binary_[kernel] = hsaco;
264   } else {
265     return port::InternalError("No method of loading ROCM kernel provided");
266   }
267 
268   VLOG(2) << "getting function " << *kernelname << " from module " << module;
269   if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
270                                     rocm_kernel->gpu_function_ptr())) {
271     return port::InternalError("Failed getting module function");
272   }
273 
274   // We have to trust the kernel loader spec arity because there doesn't appear
275   // to be a way to reflect on the number of expected arguments w/the ROCM API.
276   rocm_kernel->set_arity(spec.arity());
277 
278   KernelMetadata kernel_metadata;
279   TF_RETURN_IF_ERROR(GetKernelMetadata(rocm_kernel, &kernel_metadata));
280   kernel->set_metadata(kernel_metadata);
281   kernel->set_name(*kernelname);
282   return port::Status::OK();
283 }
284 
GetKernelMetadata(GpuKernel * rocm_kernel,KernelMetadata * kernel_metadata)285 port::Status GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
286                                             KernelMetadata* kernel_metadata) {
287   int value = 0;
288   // TODO(ROCm) implement this feature in HIP
289   kernel_metadata->set_registers_per_thread(value);
290 
291   // TODO(ROCm) implement this feature in HIP
292   kernel_metadata->set_shared_memory_bytes(value);
293   return port::Status::OK();
294 }
295 
Launch(Stream * stream,const ThreadDim & thread_dims,const BlockDim & block_dims,const KernelBase & kernel,const KernelArgsArrayBase & args)296 port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
297                                  const BlockDim& block_dims,
298                                  const KernelBase& kernel,
299                                  const KernelArgsArrayBase& args) {
300   CHECK_EQ(kernel.Arity(), args.number_of_arguments());
301   GpuStreamHandle hipstream = AsGpuStreamValue(stream);
302   const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
303   hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();
304 
305   // Only perform/print the occupancy check once.  Even just checking to see
306   // whether we've done an occupancy check on this kernel before isn't free
307   // (because we have to synchronize), so we only do this at -v 2+.
308   if (VLOG_IS_ON(2)) {
309     absl::MutexLock lock(&launched_kernels_mu_);
310     if (!launched_kernels_.count(hipfunc)) {
311       VlogOccupancyInfo(kernel, thread_dims, block_dims);
312       // TODO(rspringer): Remove elements from launched_kernels_...if we ever
313       // expose a kernel/module deallocation method.
314       launched_kernels_.insert(hipfunc);
315     }
316   }
317 
318   if (rocm_kernel->GetPreferredCacheConfig() !=
319       KernelCacheConfig::kNoPreference) {
320     TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
321         hipfunc, rocm_kernel->GetGpuCacheConfig()));
322   }
323 
324   // prepare kernargs
325   // KernelArgsArrayBase keeps the pointer of arguments
326   // deference them here
327   std::vector<void*> kernargs;
328   KernelArgIterator iter = args.arg_iterator();
329   while (iter.has_next()) {
330     KernelArg arg = iter.next();
331     VLOG(2) << "*(arg.address): "
332             << reinterpret_cast<void*>(
333                    *static_cast<const uint64_t*>(arg.address));
334     kernargs.push_back(
335         reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
336   }
337 
338   size_t size = sizeof(void*) * kernargs.size();
339   void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
340                     HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};
341 
342   return GpuDriver::LaunchKernel(
343       GetGpuContext(stream), hipfunc, block_dims.x, block_dims.y, block_dims.z,
344       thread_dims.x, thread_dims.y, thread_dims.z,
345       args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config);
346 }
347 
CalculateOccupancy(const DeviceDescription & device_description,uint64 registers_per_thread,uint64 shared_memory_per_block,const ThreadDim & thread_dims,GpuFunctionHandle func)348 int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
349                                     uint64 registers_per_thread,
350                                     uint64 shared_memory_per_block,
351                                     const ThreadDim& thread_dims,
352                                     GpuFunctionHandle func) {
353   LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
354   return 0;
355 }
356 
CompareOccupancy(int * initial_blocks,const DeviceDescription & device_description,uint64 registers_per_thread,uint64 shared_memory_per_block,const ThreadDim & thread_dims,GpuFunctionHandle func)357 int GpuExecutor::CompareOccupancy(int* initial_blocks,
358                                   const DeviceDescription& device_description,
359                                   uint64 registers_per_thread,
360                                   uint64 shared_memory_per_block,
361                                   const ThreadDim& thread_dims,
362                                   GpuFunctionHandle func) {
363   LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
364   return 0;
365 }
366 
LoadModule(const MultiModuleLoaderSpec & spec,ModuleHandle * module_handle)367 port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
368                                      ModuleHandle* module_handle) {
369   // In GpuExecutor we store the pointer to the  HSACO binary  as
370   // ModuleHandle::id().
371   hipModule_t hip_module = nullptr;
372   // TODO(ROCm): Need  generic term instead of cubin/cuda/ptx
373   if (spec.has_cuda_cubin_in_memory()) {
374     absl::MutexLock lock{&in_memory_modules_mu_};
375     TF_RETURN_IF_ERROR(LoadModuleFromHsaco(
376         reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
377         &hip_module));
378     *module_handle = ModuleHandle(const_cast<void*>(
379         static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
380     return port::Status::OK();
381   } else {
382     return port::InternalError("No HASCO binary found");
383   }
384 }
385 
LoadModuleFromCuBin(const char * cubin,hipModule_t * module)386 port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
387                                               hipModule_t* module) {
388   LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
389 }
390 
LoadModuleFromPtx(const char * ptx,hipModule_t * module)391 port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx,
392                                             hipModule_t* module) {
393   LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
394 }
395 
LoadModuleFromHsaco(const char * hsaco,hipModule_t * module)396 port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
397                                               hipModule_t* module) {
398   uint64_t module_refcount;
399   std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];
400 
401   if (*module == nullptr) {
402     TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, module));
403     module_refcount = 1;
404     in_memory_modules_[hsaco] = *module;
405     VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
406             << " as module " << *module;
407   } else {
408     ++module_refcount;
409     VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
410             << " is already loaded as module " << *module;
411   }
412   gpu_binary_to_module_[hsaco] = {*module, module_refcount};
413   return port::Status::OK();
414 }
415 
416 // This is a non-essential operation; if there's a failure, proceed without
417 // logging an error. It's nearly certain that in case of failures, we'd never
418 // get here in the first place; these are very low-impact routines.
VlogOccupancyInfo(const KernelBase & kernel,const ThreadDim & thread_dims,const BlockDim & block_dims)419 void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
420                                     const ThreadDim& thread_dims,
421                                     const BlockDim& block_dims) {
422   // TODO(ROCm) implement this feature in HIP
423 }
424 
Allocate(uint64 size,int64 memory_space)425 DeviceMemoryBase GpuExecutor::Allocate(uint64 size, int64 memory_space) {
426   CHECK_EQ(memory_space, 0);
427   return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
428 }
429 
GetSubBuffer(DeviceMemoryBase * mem,uint64 offset_bytes,uint64 size_bytes)430 void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
431                                 uint64 size_bytes) {
432   // offset and size are in bytes, so char* works as the pointer type.
433   return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
434 }
435 
Deallocate(DeviceMemoryBase * mem)436 void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
437   GpuDriver::DeviceDeallocate(context_, mem->opaque());
438 }
439 
HostMemoryRegister(void * location,uint64 size)440 bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
441   if (location == nullptr || size == 0) {
442     LOG(WARNING) << "attempting to register null or zero-sized memory: "
443                  << location << "; size " << size;
444   }
445   VLOG(2) << "registering " << location << " size " << size;
446   return GpuDriver::HostRegister(context_, location, size);
447 }
448 
HostMemoryUnregister(void * location)449 bool GpuExecutor::HostMemoryUnregister(void* location) {
450   VLOG(2) << "unregistering " << location;
451   return GpuDriver::HostUnregister(context_, location);
452 }
453 
SynchronizeAllActivity()454 bool GpuExecutor::SynchronizeAllActivity() {
455   return GpuDriver::SynchronizeContext(context_);
456 }
457 
SynchronousMemZero(DeviceMemoryBase * location,uint64 size)458 port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
459                                              uint64 size) {
460   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
461       size % 4 == 0) {
462     return GpuDriver::SynchronousMemsetUint32(
463         context_, AsROCmDevicePtr(location), 0x0, size / 4);
464   }
465   return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
466                                            0x0, size);
467 }
468 
SynchronousMemSet(DeviceMemoryBase * location,int value,uint64 size)469 port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
470                                             int value, uint64 size) {
471   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
472       size % 4 == 0) {
473     // hipMemset reinterprets "value" as a uint8.
474     uint8 byte_value = static_cast<uint8>(value);
475     uint32 pattern = (byte_value << 24) | (byte_value << 16) |
476                      (byte_value << 8) | byte_value;
477     return GpuDriver::SynchronousMemsetUint32(
478         context_, AsROCmDevicePtr(location), pattern, size / 4);
479   }
480   return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
481                                            value, size);
482 }
483 
SynchronousMemcpy(DeviceMemoryBase * gpu_dst,const void * host_src,uint64 size)484 port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
485                                             const void* host_src, uint64 size) {
486   return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
487                                          host_src, size);
488 }
489 
SynchronousMemcpy(void * host_dst,const DeviceMemoryBase & gpu_src,uint64 size)490 port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
491                                             const DeviceMemoryBase& gpu_src,
492                                             uint64 size) {
493   return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
494                                          AsROCmDevicePtr(gpu_src), size);
495 }
496 
SynchronousMemcpyDeviceToDevice(DeviceMemoryBase * gpu_dst,const DeviceMemoryBase & gpu_src,uint64 size)497 port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
498     DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
499   return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
500                                          AsROCmDevicePtr(gpu_src), size);
501 }
502 
MemZero(Stream * stream,DeviceMemoryBase * location,uint64 size)503 port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
504                                   uint64 size) {
505   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
506       size % 4 == 0) {
507     return Memset32(stream, location, 0x0, size);
508   } else {
509     return Memset(stream, location, 0x0, size);
510   }
511 }
512 
Memset(Stream * stream,DeviceMemoryBase * location,uint8 pattern,uint64 size)513 port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
514                                  uint8 pattern, uint64 size) {
515   VLOG(2) << "enqueueing memset8 operation onto stream " << stream
516           << " at location " << location << " with size " << size
517           << " and pattern " << std::hex << pattern;
518   return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
519                                             pattern, size,
520                                             AsGpuStreamValue(stream));
521 }
522 
Memset32(Stream * stream,DeviceMemoryBase * location,uint32 pattern,uint64 size)523 port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
524                                    uint32 pattern, uint64 size) {
525   VLOG(2) << "enqueueing memset32 operation onto stream " << stream
526           << " at location " << location << " with size " << size
527           << " and pattern " << std::hex << pattern;
528   CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
529         size % 4 == 0);
530   return GpuDriver::AsynchronousMemsetUint32(
531       context_, AsROCmDevicePtr(location), pattern, size / 4,
532       AsGpuStreamValue(stream));
533 }
534 
Memcpy(Stream * stream,void * host_dst,const DeviceMemoryBase & gpu_src,uint64 size)535 bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
536                          const DeviceMemoryBase& gpu_src, uint64 size) {
537   return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
538                                           AsROCmDevicePtr(gpu_src), size,
539                                           AsGpuStreamValue(stream));
540 }
541 
Memcpy(Stream * stream,DeviceMemoryBase * gpu_dst,const void * host_src,uint64 size)542 bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
543                          const void* host_src, uint64 size) {
544   return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
545                                           host_src, size,
546                                           AsGpuStreamValue(stream));
547 }
548 
MemcpyDeviceToDevice(Stream * stream,DeviceMemoryBase * gpu_dst,const DeviceMemoryBase & gpu_src,uint64 size)549 bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
550                                        DeviceMemoryBase* gpu_dst,
551                                        const DeviceMemoryBase& gpu_src,
552                                        uint64 size) {
553   return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
554                                           AsROCmDevicePtr(gpu_src), size,
555                                           AsGpuStreamValue(stream));
556 }
557 
HostCallback(Stream * stream,std::function<port::Status ()> callback)558 bool GpuExecutor::HostCallback(Stream* stream,
559                                std::function<port::Status()> callback) {
560   auto callback_ptr = new std::function<void()>([callback]() {
561     port::Status s = callback();
562     if (!s.ok()) {
563       LOG(WARNING) << "Host callback failed: " << s;
564     }
565   });
566   return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
567                                       InternalHostCallback, callback_ptr);
568 }
569 
InternalHostCallback(GpuStreamHandle stream,hipError_t status,void * data)570 /* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
571                                                     hipError_t status,
572                                                     void* data) {
573   std::function<void()>* callback =
574       reinterpret_cast<std::function<void()>*>(data);
575   (*callback)();
576   delete callback;
577 }
578 
AllocateEvent(Event * event)579 port::Status GpuExecutor::AllocateEvent(Event* event) {
580   return AsGpuEvent(event)->Init();
581 }
582 
DeallocateEvent(Event * event)583 port::Status GpuExecutor::DeallocateEvent(Event* event) {
584   return AsGpuEvent(event)->Destroy();
585 }
586 
RecordEvent(Stream * stream,Event * event)587 port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
588   return AsGpuEvent(event)->Record(AsGpuStream(stream));
589 }
590 
WaitForEvent(Stream * stream,Event * event)591 port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
592   if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
593                                    AsGpuEvent(event)->gpu_event())) {
594     return port::Status::OK();
595   } else {
596     return port::Status{
597         port::error::INTERNAL,
598         absl::StrFormat("error recording waiting for ROCM event on stream %p",
599                         stream)};
600   }
601 }
602 
PollForEventStatus(Event * event)603 Event::Status GpuExecutor::PollForEventStatus(Event* event) {
604   return AsGpuEvent(event)->PollForStatus();
605 }
606 
AllocateStream(Stream * stream)607 bool GpuExecutor::AllocateStream(Stream* stream) {
608   return AsGpuStream(stream)->Init();
609 }
610 
DeallocateStream(Stream * stream)611 void GpuExecutor::DeallocateStream(Stream* stream) {
612   GpuStream* rocm_stream = AsGpuStream(stream);
613   if (!rocm_stream->IsIdle()) {
614     LOG(ERROR) << "Deallocating stream with pending work";
615   }
616   rocm_stream->Destroy();
617 }
618 
AllocateTimer(Timer * timer)619 bool GpuExecutor::AllocateTimer(Timer* timer) {
620   return AsGpuTimer(timer)->Init();
621 }
622 
DeallocateTimer(Timer * timer)623 void GpuExecutor::DeallocateTimer(Timer* timer) {
624   AsGpuTimer(timer)->Destroy();
625 }
626 
CreateStreamDependency(Stream * dependent,Stream * other)627 bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
628   GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
629   bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
630                                    AsGpuStreamValue(other))
631                 .ok();
632   if (!ok) {
633     LOG(ERROR) << "failed to record completion event; "
634                   "therefore, failed to create inter-stream dependency";
635     return false;
636   }
637 
638   return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
639                                       other_completed_event);
640 }
641 
StartTimer(Stream * stream,Timer * timer)642 bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
643   return AsGpuTimer(timer)->Start(AsGpuStream(stream));
644 }
645 
StopTimer(Stream * stream,Timer * timer)646 bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
647   return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
648 }
649 
BlockHostUntilDone(Stream * stream)650 port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
651   return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
652 }
653 
CreateBlas()654 blas::BlasSupport* GpuExecutor::CreateBlas() {
655   PluginRegistry* registry = PluginRegistry::Instance();
656   port::StatusOr<PluginRegistry::BlasFactory> status =
657       registry->GetFactory<PluginRegistry::BlasFactory>(rocm::kROCmPlatformId,
658                                                         plugin_config_.blas());
659   if (!status.ok()) {
660     LOG(ERROR) << "Unable to retrieve BLAS factory: "
661                << status.status().error_message();
662     return nullptr;
663   }
664 
665   return status.ValueOrDie()(this);
666 }
667 
CreateDnn()668 dnn::DnnSupport* GpuExecutor::CreateDnn() {
669   PluginRegistry* registry = PluginRegistry::Instance();
670   port::StatusOr<PluginRegistry::DnnFactory> status =
671       registry->GetFactory<PluginRegistry::DnnFactory>(rocm::kROCmPlatformId,
672                                                        plugin_config_.dnn());
673   if (!status.ok()) {
674     LOG(ERROR) << "Unable to retrieve DNN factory: "
675                << status.status().error_message();
676     return nullptr;
677   }
678 
679   return status.ValueOrDie()(this);
680 }
681 
CreateFft()682 fft::FftSupport* GpuExecutor::CreateFft() {
683   PluginRegistry* registry = PluginRegistry::Instance();
684   port::StatusOr<PluginRegistry::FftFactory> status =
685       registry->GetFactory<PluginRegistry::FftFactory>(rocm::kROCmPlatformId,
686                                                        plugin_config_.fft());
687   if (!status.ok()) {
688     LOG(ERROR) << "Unable to retrieve FFT factory: "
689                << status.status().error_message();
690     return nullptr;
691   }
692 
693   return status.ValueOrDie()(this);
694 }
695 
CreateRng()696 rng::RngSupport* GpuExecutor::CreateRng() {
697   PluginRegistry* registry = PluginRegistry::Instance();
698   port::StatusOr<PluginRegistry::RngFactory> status =
699       registry->GetFactory<PluginRegistry::RngFactory>(rocm::kROCmPlatformId,
700                                                        plugin_config_.rng());
701   if (!status.ok()) {
702     LOG(ERROR) << "Unable to retrieve RNG factory: "
703                << status.status().error_message();
704     return nullptr;
705   }
706 
707   return status.ValueOrDie()(this);
708 }
709 
// DNN support is unconditionally reported as available.
// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }
712 
CanEnablePeerAccessTo(StreamExecutorInterface * other)713 bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
714   GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
715   return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
716 }
717 
EnablePeerAccessTo(StreamExecutorInterface * other)718 port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
719   GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
720   return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
721 }
722 
// Queries the driver for the device's free and total memory. Returns false if
// the query fails, in which case the outputs are not meaningful.
bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}
726 
GetSymbol(const string & symbol_name,ModuleHandle module_handle,void ** mem,size_t * bytes)727 bool GpuExecutor::GetSymbol(const string& symbol_name,
728                             ModuleHandle module_handle, void** mem,
729                             size_t* bytes) {
730   absl::MutexLock lock{&in_memory_modules_mu_};
731   if (static_cast<bool>(module_handle)) {
732     auto it = gpu_binary_to_module_.find(module_handle.id());
733     CHECK(it != gpu_binary_to_module_.end());
734     if (GpuDriver::GetModuleSymbol(
735             context_, it->second.first, symbol_name.c_str(),
736             reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
737       return true;
738     }
739   }
740 
741   for (auto& it : gpu_binary_to_module_) {
742     if (GpuDriver::GetModuleSymbol(
743             context_, it.second.first, symbol_name.c_str(),
744             reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
745       return true;
746     }
747   }
748 
749   LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name;
750   return false;
751 }
752 
FillBlockDimLimit(GpuDeviceHandle device,BlockDim * block_dim_limit)753 bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
754   // The BlockDim name is a mismatch against these GRID_DIM_* queries because
755   // we use BlockDims to express the dimensions of blocks within a grid
756   // (as opposed to ThreadDim which expresses the dimensions of threads
757   // within a block).
758   int x, y, z;
759   if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
760     return false;
761   }
762 
763   block_dim_limit->x = x;
764   block_dim_limit->y = y;
765   block_dim_limit->z = z;
766   return true;
767 }
768 
SupportsBlas() const769 bool GpuExecutor::SupportsBlas() const { return true; }
770 
SupportsFft() const771 bool GpuExecutor::SupportsFft() const { return true; }
772 
SupportsRng() const773 bool GpuExecutor::SupportsRng() const { return true; }
774 
// Creates the platform-specific event backing a stream_executor Event.
std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}
779 
// Creates the platform-specific kernel backing a stream_executor Kernel.
std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}
784 
// Creates the platform-specific stream backing a stream_executor Stream.
std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}
789 
// Creates the platform-specific timer backing a stream_executor Timer.
std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}
794 
GpuContextHack()795 void* GpuExecutor::GpuContextHack() { return context_; }
796 
gpu_context()797 GpuContext* GpuExecutor::gpu_context() { return context_; }
798 
// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
  // TODO(ROCm) implement this feature in HIP
  // NOTE(review): contrary to the header comment above, this ROCm stub returns
  // 1 (not -1) unconditionally, and both parameters are unused — confirm
  // whether callers depend on the value 1 before changing it.
  return 1;
}
808 
// Builds a DeviceDescription for the ROCm device at `device_ordinal` by
// querying the driver for device handle, ISA version, GCN arch name, PCI/NUMA
// topology, clocks, memory, and per-core/per-block limits. Returns an error
// status if one of the mandatory queries fails; best-effort queries (driver
// version, device properties, ECC, total memory, block-dim limits) fall back
// to defaults on failure.
port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  int version;
  status = GpuDriver::GetGpuISAVersion(&version, device);
  if (!status.ok()) {
    return status;
  }

  std::string gcn_arch_name;
  status = GpuDriver::GetGpuGCNArchName(device, &gcn_arch_name);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    // Best-effort: a failed query leaves driver_version at 0.
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        rocm::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
            .c_str());
    builder.set_driver_version(augmented_driver_version);
  }

  {
    string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lower the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  hipDeviceProp_t prop;
  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal)) {
    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = prop.maxThreadsDim[0];
    thread_dim_limit.y = prop.maxThreadsDim[1];
    thread_dim_limit.z = prop.maxThreadsDim[2];
    builder.set_thread_dim_limit(thread_dim_limit);

    // clockRate is divided by 1e6 (i.e. treated as kHz) to obtain GHz.
    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
    builder.set_clock_rate_ghz(clock_rate_ghz);

    // mem_bandwidth = 2 * mem_bus_width_in_bytes * mem_clock_rate_in_hz
    int64 memory_bandwidth = 2 * (int64(prop.memoryBusWidth) / 8) *
                             (int64(prop.memoryClockRate) * 1000);
    builder.set_memory_bandwidth(memory_bandwidth);
  }

  {
    // Best-effort: defaults to ECC-disabled if the query fails.
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    // Best-effort: stays at the sentinel value if the query fails.
    uint64 device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  {
    // Best-effort: block_dim_limit keeps its defaults on failure.
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("AMDGPU ISA version: ", gcn_arch_name));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("Advanced Micro Devices, Inc");
  builder.set_rocm_amdgpu_isa_version(version);
  builder.set_rocm_amdgpu_gcn_arch_name(gcn_arch_name);

  // NOTE(review): the queries below use ValueOrDie() and will abort the
  // process if the driver cannot answer them.
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  builder.set_registers_per_core_limit(64 * 1024);

  return builder.Build();
}
923 
}  // namespace gpu

}  // namespace stream_executor

// Module initializer hook for the ROCm GPU executor; body intentionally empty.
REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {});
929