/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <limits.h>  // for PATH_MAX, used in GetBinaryDir()
#include <unistd.h>

#include "absl/base/casts.h"
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/gpu/gpu_timer.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/mathutil.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/path.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/timer.h"

#ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
#error \
    "No driver calls in this file, wrap driver functionality in rocm_driver.cc."
#endif

#ifdef __ROCM_RUNTIME_H__
#error \
    "ROCM runtime being included into ROCM GPU executor; should be driver only."
#endif

namespace stream_executor {
namespace gpu {

// Given a platform-independent event datatype, returns the internal ROCM
// platform implementation pointer.
static GpuEvent* AsGpuEvent(Event* event) {
  DCHECK(event != nullptr);
  return static_cast<GpuEvent*>(event->implementation());
}

// Given a platform-independent timer datatype, returns the internal ROCM
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
  DCHECK(timer != nullptr);
  return static_cast<GpuTimer*>(timer->implementation());
}

// Given const GPU memory, returns a librocm device pointer datatype, suitable
// for passing directly to librocm APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// librocm APIs, so the caller should take care to only pass the result of const
// GPU memory conversions to librocm functions which will honor constness.
static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
  return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
}

// See description on const version above.
static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
  return AsROCmDevicePtr(*gpu_mem);
}

// Returns the GPU context owned by the GpuExecutor that backs `stream`.
static GpuContext* GetGpuContext(Stream* stream) {
  return static_cast<GpuExecutor*>(stream->parent()->implementation())
      ->gpu_context();
}

GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
  CHECK(rocm_exec != nullptr);
  return rocm_exec->gpu_context();
}

GpuExecutor::~GpuExecutor() {
  for (auto& it : disk_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  for (auto& it : in_memory_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
}

bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
  absl::MutexLock lock{&in_memory_modules_mu_};
  return UnloadGpuBinary(gpu_binary);
}

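// Decrements the refcount of the module that backs `gpu_binary`, unloading the
// module and dropping its in_memory_modules_ entry once the refcount reaches
// zero. Callers in this file hold in_memory_modules_mu_ before calling.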
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded HSACO module for " << gpu_binary;
    return false;
  }
  auto& module = module_it->second.first;
  auto& refcount = module_it->second.second;
  VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading HSACO module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
    const char* mem_it = nullptr;
    for (auto x : in_memory_modules_) {
      if (x.second == module) mem_it = x.first;
    }
    if (mem_it != nullptr) in_memory_modules_.erase(mem_it);
  }
  return true;
}

void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  absl::MutexLock lock{&in_memory_modules_mu_};
  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
            << " has never been loaded.";
    return;  // We've never seen this kernel.
  }
  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
          << " has loaded GPU code " << gpu_binary_it->second;
  UnloadGpuBinary(gpu_binary_it->second);
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}

port::Status GpuExecutor::Init(int device_ordinal,
                               DeviceOptions device_options) {
  device_ordinal_ = device_ordinal;

  auto status = GpuDriver::Init();
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::GetDevice(device_ordinal_, &device_);
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
                                    &context_);
  if (!status.ok()) {
    return status;
  }

  return GpuDriver::GetGpuISAVersion(&version_, device_);
}

bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    string* found_filename) const {
  LOG(FATAL) << "Feature not supported on ROCM platform "
                "(FindOnDiskForComputeCapability)";
  return false;
}

bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          string* found_filename) const {
  if (version_ == 0) {
    return false;
  }

  string cc_specific =
      absl::StrCat(filename, ".cc", version_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
          << cc_specific;
  if (port::FileExists(string(filename)).ok()) {
    *found_filename = string(filename);
    return true;
  }

  return false;
}

// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
//                 returned string. Example: calling this from /usr/bin/foo
//                 would return /usr/bin.
static string GetBinaryDir(bool strip_exe) {
  char exe_path[PATH_MAX] = {0};
  PCHECK(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1) != -1);
  // Make sure it's null-terminated:
  exe_path[sizeof(exe_path) - 1] = 0;

  if (strip_exe) {
    // The exe is the last component of the path, so remove one component.
    std::vector<string> components = absl::StrSplit(exe_path, '/');
    components.pop_back();
    return absl::StrJoin(components, "/");
  }
  return exe_path;
}

port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
  GpuKernel* rocm_kernel = AsGpuKernel(kernel);
  hipModule_t module = nullptr;
  const string* kernelname;

  const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
  bool has_cubin = spec.has_cuda_cubin_on_disk();
  if (has_cubin) {
    on_disk_spec = &spec.cuda_cubin_on_disk();
  }

  if (on_disk_spec != nullptr) {
    return port::InternalError(
        "Loading ROCM kernel from disk is not supported");
  } else if (spec.has_cuda_cubin_in_memory()) {
    kernelname = &spec.cuda_cubin_in_memory().kernelname();

    const char* hsaco = spec.cuda_cubin_in_memory().bytes();
    absl::MutexLock lock{&in_memory_modules_mu_};
    module = in_memory_modules_[hsaco];

    if (module == nullptr) {
      TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, &module));
    }
    kernel_to_gpu_binary_[kernel] = hsaco;
  } else {
    return port::InternalError("No method of loading ROCM kernel provided");
  }

  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    rocm_kernel->gpu_function_ptr())) {
    return port::InternalError("Failed getting module function");
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the ROCM API.
  rocm_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  TF_RETURN_IF_ERROR(GetKernelMetadata(rocm_kernel, &kernel_metadata));
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return port::Status::OK();
}

port::Status GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
                                            KernelMetadata* kernel_metadata) {
  int value = 0;
  // TODO(ROCm) implement this feature in HIP
  kernel_metadata->set_registers_per_thread(value);

  // TODO(ROCm) implement this feature in HIP
  kernel_metadata->set_shared_memory_bytes(value);
  return port::Status::OK();
}

port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  GpuStreamHandle hipstream = AsGpuStreamValue(stream);
  const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
  hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once.  Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    absl::MutexLock lock(&launched_kernels_mu_);
    if (!launched_kernels_.count(hipfunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(hipfunc);
    }
  }

  if (rocm_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
        hipfunc, rocm_kernel->GetGpuCacheConfig()));
  }

  // Prepare the kernel arguments: KernelArgsArrayBase stores pointers to the
  // argument values, so dereference them here into a flat buffer that is
  // passed to HIP via HIP_LAUNCH_PARAM_BUFFER_POINTER.
  std::vector<void*> kernargs;
  KernelArgIterator iter = args.arg_iterator();
  while (iter.has_next()) {
    KernelArg arg = iter.next();
    VLOG(2) << "*(arg.address): "
            << reinterpret_cast<void*>(
                   *static_cast<const uint64_t*>(arg.address));
    kernargs.push_back(
        reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
  }

  size_t size = sizeof(void*) * kernargs.size();
  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};

  return GpuDriver::LaunchKernel(
      GetGpuContext(stream), kernel.name(), hipfunc, block_dims.x, block_dims.y,
      block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
      args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config);
}

int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                                    uint64 registers_per_thread,
                                    uint64 shared_memory_per_block,
                                    const ThreadDim& thread_dims,
                                    GpuFunctionHandle func) {
  LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
  return 0;
}

int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64 registers_per_thread,
                                  uint64 shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  GpuFunctionHandle func) {
  LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
  return 0;
}

port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
                                     ModuleHandle* module_handle) {
  // In GpuExecutor we store the pointer to the HSACO binary as
  // ModuleHandle::id().
  hipModule_t hip_module = nullptr;
  // TODO(ROCm): Need generic term instead of cubin/cuda/ptx
  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromHsaco(
        reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
        &hip_module));
    *module_handle = ModuleHandle(const_cast<void*>(
        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
    return port::Status::OK();
  } else {
    return port::InternalError("No HSACO binary found");
  }
}

port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
                                              hipModule_t* module) {
  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
}

port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx,
                                            hipModule_t* module) {
  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
}

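// Loads the HSACO pointed to by `hsaco`, or re-uses the hipModule_t already
// loaded for that binary, tracking a refcount in gpu_binary_to_module_ so that
// repeated loads of the same binary share one module. The caller (LoadModule)
// holds in_memory_modules_mu_.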
port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
                                              hipModule_t* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, module));
    module_refcount = 1;
    in_memory_modules_[hsaco] = *module;
    VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[hsaco] = {*module, module_refcount};
  return port::Status::OK();
}

// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  // TODO(ROCm) implement this feature in HIP
}

DeviceMemoryBase GpuExecutor::Allocate(uint64 size, int64 memory_space) {
  CHECK_EQ(memory_space, 0);
  return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
}

void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                                uint64 size_bytes) {
  // offset and size are in bytes, so char* works as the pointer type.
  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
}

void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  GpuDriver::DeviceDeallocate(context_, mem->opaque());
}

bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}

bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}

bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}

port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
                                             uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsROCmDevicePtr(location), 0x0, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
                                           0x0, size);
}

port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
                                            int value, uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    // hipMemset reinterprets "value" as a uint8.
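    // Replicate that byte into a 32-bit pattern so the wider memset can be
    // used; e.g. value == 0xAB yields the pattern 0xABABABAB.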
    uint8 byte_value = static_cast<uint8>(value);
    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                     (byte_value << 8) | byte_value;
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsROCmDevicePtr(location), pattern, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
                                           value, size);
}

port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                            const void* host_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
                                         host_src, size);
}

port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
                                            const DeviceMemoryBase& gpu_src,
                                            uint64 size) {
  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
                                         AsROCmDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
                                         AsROCmDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
                                  uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return Memset32(stream, location, 0x0, size);
  } else {
    return Memset(stream, location, 0x0, size);
  }
}

port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
                                 uint8 pattern, uint64 size) {
  // Cast the uint8 pattern so it is logged as a hex number, not a character.
  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << static_cast<uint32>(pattern);
  return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
                                            pattern, size,
                                            AsGpuStreamValue(stream));
}

port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                                   uint32 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsROCmDevicePtr(location), pattern, size / 4,
      AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
                         const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
                                          AsROCmDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
                         const void* host_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
                                          host_src, size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
                                       DeviceMemoryBase* gpu_dst,
                                       const DeviceMemoryBase& gpu_src,
                                       uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
                                          AsROCmDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::HostCallback(Stream* stream,
                               std::function<port::Status()> callback) {
  auto callback_ptr = new std::function<void()>([callback]() {
    port::Status s = callback();
    if (!s.ok()) {
      LOG(WARNING) << "Host callback failed: " << s;
    }
  });
  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
                                      InternalHostCallback, callback_ptr);
}

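// Trampoline handed to GpuDriver::AddStreamCallback: invokes the heap-allocated
// std::function created in HostCallback above, then deletes it.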
/* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
                                                    hipError_t status,
                                                    void* data) {
  std::function<void()>* callback =
      reinterpret_cast<std::function<void()>*>(data);
  (*callback)();
  delete callback;
}

port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}

port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}

port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}

port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
                                   AsGpuEvent(event)->gpu_event())) {
    return port::Status::OK();
  } else {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("error waiting for ROCM event on stream %p", stream)};
  }
}

Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}

bool GpuExecutor::AllocateStream(Stream* stream) {
  return AsGpuStream(stream)->Init();
}

void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* rocm_stream = AsGpuStream(stream);
  if (!rocm_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  rocm_stream->Destroy();
}

bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}

void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}

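// Records `other`'s completed-work event on `other`'s stream and makes
// `dependent` wait on it, so work already enqueued on `other` finishes before
// later work on `dependent` runs.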
bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}

bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}

bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}

port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* GpuExecutor::CreateBlas() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::BlasFactory> status =
      registry->GetFactory<PluginRegistry::BlasFactory>(rocm::kROCmPlatformId,
                                                        plugin_config_.blas());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve BLAS factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

dnn::DnnSupport* GpuExecutor::CreateDnn() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::DnnFactory> status =
      registry->GetFactory<PluginRegistry::DnnFactory>(rocm::kROCmPlatformId,
                                                       plugin_config_.dnn());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve DNN factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

fft::FftSupport* GpuExecutor::CreateFft() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::FftFactory> status =
      registry->GetFactory<PluginRegistry::FftFactory>(rocm::kROCmPlatformId,
                                                       plugin_config_.fft());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve FFT factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

rng::RngSupport* GpuExecutor::CreateRng() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::RngFactory> status =
      registry->GetFactory<PluginRegistry::RngFactory>(rocm::kROCmPlatformId,
                                                       plugin_config_.rng());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve RNG factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }

bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
}

port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
}

bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}

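// Looks the symbol up in the module identified by `module_handle` when one is
// provided, then falls back to scanning every loaded module.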
bool GpuExecutor::GetSymbol(const string& symbol_name,
                            ModuleHandle module_handle, void** mem,
                            size_t* bytes) {
  absl::MutexLock lock{&in_memory_modules_mu_};
  if (static_cast<bool>(module_handle)) {
    auto it = gpu_binary_to_module_.find(module_handle.id());
    CHECK(it != gpu_binary_to_module_.end());
    if (GpuDriver::GetModuleSymbol(
            context_, it->second.first, symbol_name.c_str(),
            reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
      return true;
    }
  }

  for (auto& it : gpu_binary_to_module_) {
    if (GpuDriver::GetModuleSymbol(
            context_, it.second.first, symbol_name.c_str(),
            reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
      return true;
    }
  }

  LOG(INFO) << "Failed to find symbol in any modules: " << symbol_name;
  return false;
}

bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
    return false;
  }

  block_dim_limit->x = x;
  block_dim_limit->y = y;
  block_dim_limit->z = z;
  return true;
}

bool GpuExecutor::SupportsBlas() const { return true; }

bool GpuExecutor::SupportsFft() const { return true; }

bool GpuExecutor::SupportsRng() const { return true; }

std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}

std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}

std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}

std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}

void* GpuExecutor::GpuContextHack() { return context_; }

GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
//
// TODO(ROCm): not yet implemented in the HIP port; a fixed placeholder value
// is returned instead.
static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
  // TODO(ROCm) implement this feature in HIP
  return 1;
}

port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  int version;
  status = GpuDriver::GetGpuISAVersion(&version, device);
  if (!status.ok()) {
    return status;
  }

  std::string gcn_arch_name;
  status = GpuDriver::GetGpuGCNArchName(device, &gcn_arch_name);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        rocm::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
            .c_str());
    builder.set_driver_version(augmented_driver_version);
  }

  {
    string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lower the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  hipDeviceProp_t prop;
  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal)) {
    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = prop.maxThreadsDim[0];
    thread_dim_limit.y = prop.maxThreadsDim[1];
    thread_dim_limit.z = prop.maxThreadsDim[2];
    builder.set_thread_dim_limit(thread_dim_limit);

    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
    builder.set_clock_rate_ghz(clock_rate_ghz);

    // mem_bandwidth = 2 * mem_bus_width_in_bytes * mem_clock_rate_in_hz
    int64 memory_bandwidth = 2 * (int64(prop.memoryBusWidth) / 8) *
                             (int64(prop.memoryClockRate) * 1000);
    builder.set_memory_bandwidth(memory_bandwidth);
  }

  {
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    uint64 device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  {
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("AMDGPU ISA version: ", gcn_arch_name));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("Advanced Micro Devices, Inc");
  builder.set_rocm_amdgpu_isa_version(version);
  builder.set_rocm_amdgpu_gcn_arch_name(gcn_arch_name);

  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  builder.set_registers_per_core_limit(64 * 1024);

  return builder.Build();
}

}  // namespace gpu

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {});
925