1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include <unistd.h>
17
18 #include "absl/base/casts.h"
19 #include "absl/strings/ascii.h"
20 #include "absl/strings/str_cat.h"
21 #include "absl/strings/str_format.h"
22 #include "absl/strings/str_join.h"
23 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
24 #include "tensorflow/stream_executor/gpu/gpu_event.h"
25 #include "tensorflow/stream_executor/gpu/gpu_executor.h"
26 #include "tensorflow/stream_executor/gpu/gpu_stream.h"
27 #include "tensorflow/stream_executor/gpu/gpu_timer.h"
28 #include "tensorflow/stream_executor/kernel_cache_config.h"
29 #include "tensorflow/stream_executor/lib/env.h"
30 #include "tensorflow/stream_executor/lib/error.h"
31 #include "tensorflow/stream_executor/lib/initialize.h"
32 #include "tensorflow/stream_executor/lib/mathutil.h"
33 #include "tensorflow/stream_executor/lib/numbers.h"
34 #include "tensorflow/stream_executor/lib/path.h"
35 #include "tensorflow/stream_executor/lib/process_state.h"
36 #include "tensorflow/stream_executor/lib/statusor.h"
37 #include "tensorflow/stream_executor/platform.h"
38 #include "tensorflow/stream_executor/platform/dso_loader.h"
39 #include "tensorflow/stream_executor/platform/logging.h"
40 #include "tensorflow/stream_executor/platform/port.h"
41 #include "tensorflow/stream_executor/plugin_registry.h"
42 #include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
43 #include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
44 #include "tensorflow/stream_executor/stream.h"
45 #include "tensorflow/stream_executor/stream_executor_internal.h"
46 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
47 #include "tensorflow/stream_executor/timer.h"
48
49 #ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
50 #error \
51 "No driver calls in this file, wrap driver functionality in rocm_driver.cc."
52 #endif
53
54 #ifdef __ROCM_RUNTIME_H__
55 #error \
56 "ROCM runtime being included into ROCM GPU executor; should be driver only."
57 #endif
58
59 namespace stream_executor {
60 namespace gpu {
61
AsGpuEvent(Event * event)62 static GpuEvent* AsGpuEvent(Event* event) {
63 DCHECK(event != nullptr);
64 return static_cast<GpuEvent*>(event->implementation());
65 }
66
67 // Given a platform-independent timer datatype, returns the internal ROCM
68 // platform implementation pointer.
AsGpuTimer(Timer * timer)69 static GpuTimer* AsGpuTimer(Timer* timer) {
70 DCHECK(timer != nullptr);
71 return static_cast<GpuTimer*>(timer->implementation());
72 }
73
74 // Given const GPU memory, returns a librocm device pointer datatype, suitable
75 // for passing directly to librocm APIs.
76 //
77 // N.B. we must lose constness in order to pass a suitable type to the existing
78 // librocm APIs, so the caller should take care to only pass the result of const
79 // GPU memory conversions to librocm functions which will honor constness.
AsROCmDevicePtr(const DeviceMemoryBase & gpu_mem)80 static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
81 return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
82 }
83
84 // See description on const version above.
AsROCmDevicePtr(DeviceMemoryBase * gpu_mem)85 static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
86 return AsROCmDevicePtr(*gpu_mem);
87 }
88
GetGpuContext(Stream * stream)89 static GpuContext* GetGpuContext(Stream* stream) {
90 return static_cast<GpuExecutor*>(stream->parent()->implementation())
91 ->gpu_context();
92 }
93
ExtractGpuContext(GpuExecutor * rocm_exec)94 GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
95 CHECK(rocm_exec != nullptr);
96 return rocm_exec->gpu_context();
97 }
98
ExtractGpuExecutor(StreamExecutor * stream_exec)99 GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
100 return static_cast<GpuExecutor*>(stream_exec->implementation());
101 }
102
GpuExecutor::~GpuExecutor() {
  // Unload any modules still resident before destroying the context that
  // owns them.
  for (auto& it : disk_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  for (auto& it : in_memory_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
  // Clients are expected to have released all kernels/modules via
  // UnloadKernel()/UnloadModule() before the executor is destroyed.
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
}
UnloadModule(ModuleHandle module_handle)116 bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
117 const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
118 absl::MutexLock lock{&in_memory_modules_mu_};
119 return UnloadGpuBinary(gpu_binary);
120 }
121
// Drops one reference on the module keyed by |gpu_binary|, unloading it from
// the driver when the refcount reaches zero. Returns false if the binary was
// never loaded. Callers hold in_memory_modules_mu_.
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded HSACO module for " << gpu_binary;
    return false;
  }
  auto& module = module_it->second.first;
  auto& refcount = module_it->second.second;
  VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading HSACO module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
    // Also drop the in_memory_modules_ entry (if any) that maps to this
    // module, since the handle is now invalid.
    const char* mem_it = nullptr;
    for (auto x : in_memory_modules_) {
      if (x.second == module) mem_it = x.first;
    }
    if (mem_it != nullptr) in_memory_modules_.erase(mem_it);
  }
  return true;
}
143
// Releases the GPU binary backing |kernel|, if it was ever loaded via
// GetKernel(). No-op (with a VLOG) for kernels this executor never saw.
void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  absl::MutexLock lock{&in_memory_modules_mu_};
  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
            << " has never been loaded.";
    return;  // We've never seen this kernel.
  }
  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
          << " has loaded GPU code " << gpu_binary_it->second;
  UnloadGpuBinary(gpu_binary_it->second);
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}
159
Init(int device_ordinal,DeviceOptions device_options)160 port::Status GpuExecutor::Init(int device_ordinal,
161 DeviceOptions device_options) {
162 device_ordinal_ = device_ordinal;
163
164 auto status = GpuDriver::Init();
165 if (!status.ok()) {
166 return status;
167 }
168
169 status = GpuDriver::GetDevice(device_ordinal_, &device_);
170 if (!status.ok()) {
171 return status;
172 }
173
174 status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
175 &context_);
176 if (!status.ok()) {
177 return status;
178 }
179
180 return GpuDriver::GetGpuISAVersion(&version_, device_);
181 }
182
// CUDA-only concept; calling this on the ROCm platform is a fatal error.
// (ROCm lookups go through FindOnDiskForISAVersion instead.)
bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    string* found_filename) const {
  LOG(FATAL) << "Feature not supported on ROCM platform "
                "(FindOnDiskForComputeCapability)";
  return false;
}
190
// Looks for an AMDGPU-ISA-version-specific variant of |filename|
// ("<filename>.cc<version><canonical_suffix>"), falling back to the plain
// |filename| if the specific file does not exist. Returns true and sets
// *found_filename on success; returns false when version_ is unknown (0) or
// neither file exists.
bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          string* found_filename) const {
  if (version_ == 0) {
    return false;
  }

  string cc_specific =
      absl::StrCat(filename, ".cc", version_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
          << cc_specific;
  // Fall back to the generic file if present.
  if (port::FileExists(string(filename)).ok()) {
    *found_filename = string(filename);
    return true;
  }

  return false;
}
216
217 // Returns the path to the running executable.
218 // N.B. Derived from //knowledge/smalltalk/background_kb.cc
219 // Arg: strip_exe: if true, remove the name of the executable itself from the
220 // returned string. Example: calling this from /usr/bin/foo
221 // would return /usr/bin.
GetBinaryDir(bool strip_exe)222 static string GetBinaryDir(bool strip_exe) {
223 char exe_path[PATH_MAX] = {0};
224 PCHECK(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1) != -1);
225 // Make sure it's null-terminated:
226 exe_path[sizeof(exe_path) - 1] = 0;
227
228 if (strip_exe) {
229 // The exe is the last component of the path, so remove one component.
230 string ret = exe_path;
231 std::vector<string> components = absl::StrSplit(exe_path, '/');
232 components.pop_back();
233 return absl::StrJoin(components, "/");
234 }
235 return exe_path;
236 }
237
// Resolves |spec| into a hipFunction_t stored in |kernel|. Only the
// "cuda cubin in memory" flavor (reused on ROCm to carry an HSACO image) is
// supported; on-disk kernels are rejected.
port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
  GpuKernel* rocm_kernel = AsGpuKernel(kernel);
  hipModule_t module = nullptr;
  const string* kernelname;

  const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
  bool has_cubin = spec.has_cuda_cubin_on_disk();
  if (has_cubin) {
    on_disk_spec = &spec.cuda_cubin_on_disk();
  }

  if (on_disk_spec != nullptr) {
    return port::InternalError(
        "Loading ROCM kernel from disk is not supported");
  } else if (spec.has_cuda_cubin_in_memory()) {
    kernelname = &spec.cuda_cubin_in_memory().kernelname();

    const char* hsaco = spec.cuda_cubin_in_memory().bytes();
    absl::MutexLock lock{&in_memory_modules_mu_};
    // operator[] default-inserts nullptr on first use; the null check below
    // then triggers the actual load. Subsequent lookups reuse the cache.
    module = in_memory_modules_[hsaco];

    if (module == nullptr) {
      TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, &module));
    }
    kernel_to_gpu_binary_[kernel] = hsaco;
  } else {
    return port::InternalError("No method of loading ROCM kernel provided");
  }

  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    rocm_kernel->gpu_function_ptr())) {
    return port::InternalError("Failed getting module function");
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the ROCM API.
  rocm_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  TF_RETURN_IF_ERROR(GetKernelMetadata(rocm_kernel, &kernel_metadata));
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return port::Status::OK();
}
284
GetKernelMetadata(GpuKernel * rocm_kernel,KernelMetadata * kernel_metadata)285 port::Status GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
286 KernelMetadata* kernel_metadata) {
287 int value = 0;
288 // TODO(ROCm) implement this feature in HIP
289 kernel_metadata->set_registers_per_thread(value);
290
291 // TODO(ROCm) implement this feature in HIP
292 kernel_metadata->set_shared_memory_bytes(value);
293 return port::Status::OK();
294 }
295
// Enqueues a kernel launch on |stream|: optional one-time occupancy logging,
// cache-config selection, flattening of the argument array into the
// HIP_LAUNCH_PARAM_* "extra" buffer, then the driver launch call.
port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  GpuStreamHandle hipstream = AsGpuStreamValue(stream);
  const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
  hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once. Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    absl::MutexLock lock(&launched_kernels_mu_);
    if (!launched_kernels_.count(hipfunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(hipfunc);
    }
  }

  if (rocm_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
        hipfunc, rocm_kernel->GetGpuCacheConfig()));
  }

  // prepare kernargs
  // KernelArgsArrayBase keeps the pointer of arguments
  // deference them here
  std::vector<void*> kernargs;
  KernelArgIterator iter = args.arg_iterator();
  while (iter.has_next()) {
    KernelArg arg = iter.next();
    // Each arg.address points at a pointer-sized value; dereference it so the
    // packed buffer holds the argument values themselves.
    VLOG(2) << "*(arg.address): "
            << reinterpret_cast<void*>(
                   *static_cast<const uint64_t*>(arg.address));
    kernargs.push_back(
        reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
  }

  // Pass the packed argument buffer through HIP's "extra" launch mechanism
  // (nullptr is passed for the regular kernelParams array).
  size_t size = sizeof(void*) * kernargs.size();
  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};

  return GpuDriver::LaunchKernel(
      GetGpuContext(stream), hipfunc, block_dims.x, block_dims.y, block_dims.z,
      thread_dims.x, thread_dims.y, thread_dims.z,
      args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config);
}
347
// Not implemented on ROCm; any call is a fatal error.
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                                    uint64 registers_per_thread,
                                    uint64 shared_memory_per_block,
                                    const ThreadDim& thread_dims,
                                    GpuFunctionHandle func) {
  LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
  return 0;
}
356
// Not implemented on ROCm; any call is a fatal error.
int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64 registers_per_thread,
                                  uint64 shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  GpuFunctionHandle func) {
  LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
  return 0;
}
366
LoadModule(const MultiModuleLoaderSpec & spec,ModuleHandle * module_handle)367 port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
368 ModuleHandle* module_handle) {
369 // In GpuExecutor we store the pointer to the HSACO binary as
370 // ModuleHandle::id().
371 hipModule_t hip_module = nullptr;
372 // TODO(ROCm): Need generic term instead of cubin/cuda/ptx
373 if (spec.has_cuda_cubin_in_memory()) {
374 absl::MutexLock lock{&in_memory_modules_mu_};
375 TF_RETURN_IF_ERROR(LoadModuleFromHsaco(
376 reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
377 &hip_module));
378 *module_handle = ModuleHandle(const_cast<void*>(
379 static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
380 return port::Status::OK();
381 } else {
382 return port::InternalError("No HASCO binary found");
383 }
384 }
385
// CUDA-only entry point; fatal on ROCm (use LoadModuleFromHsaco instead).
port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
                                              hipModule_t* module) {
  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
}
390
// CUDA-only entry point; fatal on ROCm (use LoadModuleFromHsaco instead).
port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx,
                                            hipModule_t* module) {
  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
}
395
// Loads the HSACO image at |hsaco| into *module, or bumps the refcount if it
// is already resident. Callers hold in_memory_modules_mu_ (see LoadModule).
port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
                                              hipModule_t* module) {
  uint64_t module_refcount;
  // operator[] default-inserts {nullptr, 0} on first sight of this binary.
  std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, module));
    module_refcount = 1;
    in_memory_modules_[hsaco] = *module;
    VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
            << " is already loaded as module " << *module;
  }
  // Write back the (module, refcount) pair.
  gpu_binary_to_module_[hsaco] = {*module, module_refcount};
  return port::Status::OK();
}
415
416 // This is a non-essential operation; if there's a failure, proceed without
417 // logging an error. It's nearly certain that in case of failures, we'd never
418 // get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  // TODO(ROCm): implement this feature in HIP. Currently a no-op, so the
  // occupancy logging in Launch() produces no output on ROCm.
}
424
Allocate(uint64 size,int64 memory_space)425 DeviceMemoryBase GpuExecutor::Allocate(uint64 size, int64 memory_space) {
426 CHECK_EQ(memory_space, 0);
427 return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
428 }
429
GetSubBuffer(DeviceMemoryBase * mem,uint64 offset_bytes,uint64 size_bytes)430 void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
431 uint64 size_bytes) {
432 // offset and size are in bytes, so char* works as the pointer type.
433 return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
434 }
435
Deallocate(DeviceMemoryBase * mem)436 void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
437 GpuDriver::DeviceDeallocate(context_, mem->opaque());
438 }
439
HostMemoryRegister(void * location,uint64 size)440 bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
441 if (location == nullptr || size == 0) {
442 LOG(WARNING) << "attempting to register null or zero-sized memory: "
443 << location << "; size " << size;
444 }
445 VLOG(2) << "registering " << location << " size " << size;
446 return GpuDriver::HostRegister(context_, location, size);
447 }
448
HostMemoryUnregister(void * location)449 bool GpuExecutor::HostMemoryUnregister(void* location) {
450 VLOG(2) << "unregistering " << location;
451 return GpuDriver::HostUnregister(context_, location);
452 }
453
SynchronizeAllActivity()454 bool GpuExecutor::SynchronizeAllActivity() {
455 return GpuDriver::SynchronizeContext(context_);
456 }
457
SynchronousMemZero(DeviceMemoryBase * location,uint64 size)458 port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
459 uint64 size) {
460 if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
461 size % 4 == 0) {
462 return GpuDriver::SynchronousMemsetUint32(
463 context_, AsROCmDevicePtr(location), 0x0, size / 4);
464 }
465 return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
466 0x0, size);
467 }
468
SynchronousMemSet(DeviceMemoryBase * location,int value,uint64 size)469 port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
470 int value, uint64 size) {
471 if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
472 size % 4 == 0) {
473 // hipMemset reinterprets "value" as a uint8.
474 uint8 byte_value = static_cast<uint8>(value);
475 uint32 pattern = (byte_value << 24) | (byte_value << 16) |
476 (byte_value << 8) | byte_value;
477 return GpuDriver::SynchronousMemsetUint32(
478 context_, AsROCmDevicePtr(location), pattern, size / 4);
479 }
480 return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
481 value, size);
482 }
483
SynchronousMemcpy(DeviceMemoryBase * gpu_dst,const void * host_src,uint64 size)484 port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
485 const void* host_src, uint64 size) {
486 return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
487 host_src, size);
488 }
489
SynchronousMemcpy(void * host_dst,const DeviceMemoryBase & gpu_src,uint64 size)490 port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
491 const DeviceMemoryBase& gpu_src,
492 uint64 size) {
493 return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
494 AsROCmDevicePtr(gpu_src), size);
495 }
496
SynchronousMemcpyDeviceToDevice(DeviceMemoryBase * gpu_dst,const DeviceMemoryBase & gpu_src,uint64 size)497 port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
498 DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
499 return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
500 AsROCmDevicePtr(gpu_src), size);
501 }
502
MemZero(Stream * stream,DeviceMemoryBase * location,uint64 size)503 port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
504 uint64 size) {
505 if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
506 size % 4 == 0) {
507 return Memset32(stream, location, 0x0, size);
508 } else {
509 return Memset(stream, location, 0x0, size);
510 }
511 }
512
Memset(Stream * stream,DeviceMemoryBase * location,uint8 pattern,uint64 size)513 port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
514 uint8 pattern, uint64 size) {
515 VLOG(2) << "enqueueing memset8 operation onto stream " << stream
516 << " at location " << location << " with size " << size
517 << " and pattern " << std::hex << pattern;
518 return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
519 pattern, size,
520 AsGpuStreamValue(stream));
521 }
522
Memset32(Stream * stream,DeviceMemoryBase * location,uint32 pattern,uint64 size)523 port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
524 uint32 pattern, uint64 size) {
525 VLOG(2) << "enqueueing memset32 operation onto stream " << stream
526 << " at location " << location << " with size " << size
527 << " and pattern " << std::hex << pattern;
528 CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
529 size % 4 == 0);
530 return GpuDriver::AsynchronousMemsetUint32(
531 context_, AsROCmDevicePtr(location), pattern, size / 4,
532 AsGpuStreamValue(stream));
533 }
534
Memcpy(Stream * stream,void * host_dst,const DeviceMemoryBase & gpu_src,uint64 size)535 bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
536 const DeviceMemoryBase& gpu_src, uint64 size) {
537 return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
538 AsROCmDevicePtr(gpu_src), size,
539 AsGpuStreamValue(stream));
540 }
541
Memcpy(Stream * stream,DeviceMemoryBase * gpu_dst,const void * host_src,uint64 size)542 bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
543 const void* host_src, uint64 size) {
544 return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
545 host_src, size,
546 AsGpuStreamValue(stream));
547 }
548
MemcpyDeviceToDevice(Stream * stream,DeviceMemoryBase * gpu_dst,const DeviceMemoryBase & gpu_src,uint64 size)549 bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
550 DeviceMemoryBase* gpu_dst,
551 const DeviceMemoryBase& gpu_src,
552 uint64 size) {
553 return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
554 AsROCmDevicePtr(gpu_src), size,
555 AsGpuStreamValue(stream));
556 }
557
HostCallback(Stream * stream,std::function<port::Status ()> callback)558 bool GpuExecutor::HostCallback(Stream* stream,
559 std::function<port::Status()> callback) {
560 auto callback_ptr = new std::function<void()>([callback]() {
561 port::Status s = callback();
562 if (!s.ok()) {
563 LOG(WARNING) << "Host callback failed: " << s;
564 }
565 });
566 return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
567 InternalHostCallback, callback_ptr);
568 }
569
InternalHostCallback(GpuStreamHandle stream,hipError_t status,void * data)570 /* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
571 hipError_t status,
572 void* data) {
573 std::function<void()>* callback =
574 reinterpret_cast<std::function<void()>*>(data);
575 (*callback)();
576 delete callback;
577 }
578
AllocateEvent(Event * event)579 port::Status GpuExecutor::AllocateEvent(Event* event) {
580 return AsGpuEvent(event)->Init();
581 }
582
DeallocateEvent(Event * event)583 port::Status GpuExecutor::DeallocateEvent(Event* event) {
584 return AsGpuEvent(event)->Destroy();
585 }
586
RecordEvent(Stream * stream,Event * event)587 port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
588 return AsGpuEvent(event)->Record(AsGpuStream(stream));
589 }
590
WaitForEvent(Stream * stream,Event * event)591 port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
592 if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
593 AsGpuEvent(event)->gpu_event())) {
594 return port::Status::OK();
595 } else {
596 return port::Status{
597 port::error::INTERNAL,
598 absl::StrFormat("error recording waiting for ROCM event on stream %p",
599 stream)};
600 }
601 }
602
PollForEventStatus(Event * event)603 Event::Status GpuExecutor::PollForEventStatus(Event* event) {
604 return AsGpuEvent(event)->PollForStatus();
605 }
606
AllocateStream(Stream * stream)607 bool GpuExecutor::AllocateStream(Stream* stream) {
608 return AsGpuStream(stream)->Init();
609 }
610
DeallocateStream(Stream * stream)611 void GpuExecutor::DeallocateStream(Stream* stream) {
612 GpuStream* rocm_stream = AsGpuStream(stream);
613 if (!rocm_stream->IsIdle()) {
614 LOG(ERROR) << "Deallocating stream with pending work";
615 }
616 rocm_stream->Destroy();
617 }
618
AllocateTimer(Timer * timer)619 bool GpuExecutor::AllocateTimer(Timer* timer) {
620 return AsGpuTimer(timer)->Init();
621 }
622
DeallocateTimer(Timer * timer)623 void GpuExecutor::DeallocateTimer(Timer* timer) {
624 AsGpuTimer(timer)->Destroy();
625 }
626
// Makes future work on |dependent| wait for all work currently enqueued on
// |other|, by recording |other|'s completion event and having |dependent|
// wait on it.
bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}
641
StartTimer(Stream * stream,Timer * timer)642 bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
643 return AsGpuTimer(timer)->Start(AsGpuStream(stream));
644 }
645
StopTimer(Stream * stream,Timer * timer)646 bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
647 return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
648 }
649
BlockHostUntilDone(Stream * stream)650 port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
651 return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
652 }
653
CreateBlas()654 blas::BlasSupport* GpuExecutor::CreateBlas() {
655 PluginRegistry* registry = PluginRegistry::Instance();
656 port::StatusOr<PluginRegistry::BlasFactory> status =
657 registry->GetFactory<PluginRegistry::BlasFactory>(rocm::kROCmPlatformId,
658 plugin_config_.blas());
659 if (!status.ok()) {
660 LOG(ERROR) << "Unable to retrieve BLAS factory: "
661 << status.status().error_message();
662 return nullptr;
663 }
664
665 return status.ValueOrDie()(this);
666 }
667
CreateDnn()668 dnn::DnnSupport* GpuExecutor::CreateDnn() {
669 PluginRegistry* registry = PluginRegistry::Instance();
670 port::StatusOr<PluginRegistry::DnnFactory> status =
671 registry->GetFactory<PluginRegistry::DnnFactory>(rocm::kROCmPlatformId,
672 plugin_config_.dnn());
673 if (!status.ok()) {
674 LOG(ERROR) << "Unable to retrieve DNN factory: "
675 << status.status().error_message();
676 return nullptr;
677 }
678
679 return status.ValueOrDie()(this);
680 }
681
CreateFft()682 fft::FftSupport* GpuExecutor::CreateFft() {
683 PluginRegistry* registry = PluginRegistry::Instance();
684 port::StatusOr<PluginRegistry::FftFactory> status =
685 registry->GetFactory<PluginRegistry::FftFactory>(rocm::kROCmPlatformId,
686 plugin_config_.fft());
687 if (!status.ok()) {
688 LOG(ERROR) << "Unable to retrieve FFT factory: "
689 << status.status().error_message();
690 return nullptr;
691 }
692
693 return status.ValueOrDie()(this);
694 }
695
CreateRng()696 rng::RngSupport* GpuExecutor::CreateRng() {
697 PluginRegistry* registry = PluginRegistry::Instance();
698 port::StatusOr<PluginRegistry::RngFactory> status =
699 registry->GetFactory<PluginRegistry::RngFactory>(rocm::kROCmPlatformId,
700 plugin_config_.rng());
701 if (!status.ok()) {
702 LOG(ERROR) << "Unable to retrieve RNG factory: "
703 << status.status().error_message();
704 return nullptr;
705 }
706
707 return status.ValueOrDie()(this);
708 }
709
710 // TODO(rspringer): Remove in b/18544742.
SupportsDnn() const711 bool GpuExecutor::SupportsDnn() const { return true; }
712
CanEnablePeerAccessTo(StreamExecutorInterface * other)713 bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
714 GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
715 return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
716 }
717
EnablePeerAccessTo(StreamExecutorInterface * other)718 port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
719 GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
720 return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
721 }
722
DeviceMemoryUsage(int64 * free,int64 * total) const723 bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
724 return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
725 }
726
GetSymbol(const string & symbol_name,ModuleHandle module_handle,void ** mem,size_t * bytes)727 bool GpuExecutor::GetSymbol(const string& symbol_name,
728 ModuleHandle module_handle, void** mem,
729 size_t* bytes) {
730 absl::MutexLock lock{&in_memory_modules_mu_};
731 if (static_cast<bool>(module_handle)) {
732 auto it = gpu_binary_to_module_.find(module_handle.id());
733 CHECK(it != gpu_binary_to_module_.end());
734 if (GpuDriver::GetModuleSymbol(
735 context_, it->second.first, symbol_name.c_str(),
736 reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
737 return true;
738 }
739 }
740
741 for (auto& it : gpu_binary_to_module_) {
742 if (GpuDriver::GetModuleSymbol(
743 context_, it.second.first, symbol_name.c_str(),
744 reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
745 return true;
746 }
747 }
748
749 LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name;
750 return false;
751 }
752
FillBlockDimLimit(GpuDeviceHandle device,BlockDim * block_dim_limit)753 bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
754 // The BlockDim name is a mismatch against these GRID_DIM_* queries because
755 // we use BlockDims to express the dimensions of blocks within a grid
756 // (as opposed to ThreadDim which expresses the dimensions of threads
757 // within a block).
758 int x, y, z;
759 if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
760 return false;
761 }
762
763 block_dim_limit->x = x;
764 block_dim_limit->y = y;
765 block_dim_limit->z = z;
766 return true;
767 }
768
SupportsBlas() const769 bool GpuExecutor::SupportsBlas() const { return true; }
770
SupportsFft() const771 bool GpuExecutor::SupportsFft() const { return true; }
772
SupportsRng() const773 bool GpuExecutor::SupportsRng() const { return true; }
774
775 std::unique_ptr<internal::EventInterface>
CreateEventImplementation()776 GpuExecutor::CreateEventImplementation() {
777 return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
778 }
779
780 std::unique_ptr<internal::KernelInterface>
CreateKernelImplementation()781 GpuExecutor::CreateKernelImplementation() {
782 return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
783 }
784
785 std::unique_ptr<internal::StreamInterface>
GetStreamImplementation()786 GpuExecutor::GetStreamImplementation() {
787 return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
788 }
789
790 std::unique_ptr<internal::TimerInterface>
GetTimerImplementation()791 GpuExecutor::GetTimerImplementation() {
792 return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
793 }
794
GpuContextHack()795 void* GpuExecutor::GpuContextHack() { return context_; }
796
gpu_context()797 GpuContext* GpuExecutor::gpu_context() { return context_; }
798
// Intended to read the NUMA node corresponding to the GPU device's PCI bus
// out of SysFS.
//
// NOTE(review): this is currently a stub — both arguments are ignored and
// node 1 is returned unconditionally (the previous comment's "-1 if it
// cannot" did not match the code). Callers therefore always see NUMA node 1
// until the HIP implementation lands.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
  // TODO(ROCm) implement this feature in HIP
  return 1;
}
808
// Builds a DeviceDescription for the device at `device_ordinal` by querying
// the ROCm driver. Returns the first driver error encountered for the
// queries that are mandatory (device handle, ISA version, GCN arch name,
// device name); other queries fall back to defaults on failure.
port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  int version;
  status = GpuDriver::GetGpuISAVersion(&version, device);
  if (!status.ok()) {
    return status;
  }

  std::string gcn_arch_name;
  status = GpuDriver::GetGpuGCNArchName(device, &gcn_arch_name);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    // Best-effort: a failed version query leaves driver_version at 0.
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        rocm::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
            .c_str());
    builder.set_driver_version(augmented_driver_version);
  }

  {
    string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lower the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  // Thread/clock/bandwidth limits come from the HIP device properties; if
  // the query fails these builder fields keep their defaults.
  hipDeviceProp_t prop;
  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal)) {
    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = prop.maxThreadsDim[0];
    thread_dim_limit.y = prop.maxThreadsDim[1];
    thread_dim_limit.z = prop.maxThreadsDim[2];
    builder.set_thread_dim_limit(thread_dim_limit);

    // clockRate is reported in kHz; convert to GHz.
    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
    builder.set_clock_rate_ghz(clock_rate_ghz);

    // mem_bandwidth = 2 * mem_bus_width_in_bytes * mem_clock_rate_in_hz
    int64 memory_bandwidth = 2 * (int64(prop.memoryBusWidth) / 8) *
                             (int64(prop.memoryClockRate) * 1000);
    builder.set_memory_bandwidth(memory_bandwidth);
  }

  {
    // Best-effort: defaults to ECC-disabled if the query fails.
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    // Best-effort: -1 (all-ones as unsigned) signals an unknown size.
    uint64 device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  {
    // Best-effort: failure leaves block_dim_limit default-initialized.
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("AMDGPU ISA version: ", gcn_arch_name));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("Advanced Micro Devices, Inc");
  builder.set_rocm_amdgpu_isa_version(version);
  builder.set_rocm_amdgpu_gcn_arch_name(gcn_arch_name);

  // NOTE(review): these ValueOrDie() calls terminate the process if the
  // underlying driver queries fail, rather than propagating a status.
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  builder.set_registers_per_core_limit(64 * 1024);

  return builder.Build();
}
923
924 } // namespace gpu
925
926 } // namespace stream_executor
927
// No-op module initializer; registers the name `rocm_gpu_executor` with the
// static-initialization framework (presumably so other initializers can
// order themselves relative to this translation unit — confirm against the
// REGISTER_MODULE_INITIALIZER macro's contract).
REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {});
929