1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include <unistd.h>
17
18 #include "absl/base/casts.h"
19 #include "absl/strings/ascii.h"
20 #include "absl/strings/str_cat.h"
21 #include "absl/strings/str_format.h"
22 #include "absl/strings/str_join.h"
23 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
24 #include "tensorflow/stream_executor/gpu/gpu_event.h"
25 #include "tensorflow/stream_executor/gpu/gpu_executor.h"
26 #include "tensorflow/stream_executor/gpu/gpu_stream.h"
27 #include "tensorflow/stream_executor/gpu/gpu_timer.h"
28 #include "tensorflow/stream_executor/kernel_cache_config.h"
29 #include "tensorflow/stream_executor/lib/env.h"
30 #include "tensorflow/stream_executor/lib/error.h"
31 #include "tensorflow/stream_executor/lib/initialize.h"
32 #include "tensorflow/stream_executor/lib/mathutil.h"
33 #include "tensorflow/stream_executor/lib/numbers.h"
34 #include "tensorflow/stream_executor/lib/path.h"
35 #include "tensorflow/stream_executor/lib/process_state.h"
36 #include "tensorflow/stream_executor/lib/statusor.h"
37 #include "tensorflow/stream_executor/platform.h"
38 #include "tensorflow/stream_executor/platform/dso_loader.h"
39 #include "tensorflow/stream_executor/platform/logging.h"
40 #include "tensorflow/stream_executor/platform/port.h"
41 #include "tensorflow/stream_executor/plugin_registry.h"
42 #include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
43 #include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
44 #include "tensorflow/stream_executor/stream.h"
45 #include "tensorflow/stream_executor/stream_executor_internal.h"
46 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
47 #include "tensorflow/stream_executor/timer.h"
48
49 #ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
50 #error \
51 "No driver calls in this file, wrap driver functionality in rocm_driver.cc."
52 #endif
53
54 #ifdef __ROCM_RUNTIME_H__
55 #error \
56 "ROCM runtime being included into ROCM GPU executor; should be driver only."
57 #endif
58
59 namespace stream_executor {
60 namespace gpu {
61
AsGpuEvent(Event * event)62 static GpuEvent* AsGpuEvent(Event* event) {
63 DCHECK(event != nullptr);
64 return static_cast<GpuEvent*>(event->implementation());
65 }
66
67 // Given a platform-independent timer datatype, returns the internal ROCM
68 // platform implementation pointer.
AsGpuTimer(Timer * timer)69 static GpuTimer* AsGpuTimer(Timer* timer) {
70 DCHECK(timer != nullptr);
71 return static_cast<GpuTimer*>(timer->implementation());
72 }
73
74 // Given const GPU memory, returns a librocm device pointer datatype, suitable
75 // for passing directly to librocm APIs.
76 //
77 // N.B. we must lose constness in order to pass a suitable type to the existing
78 // librocm APIs, so the caller should take care to only pass the result of const
79 // GPU memory conversions to librocm functions which will honor constness.
AsROCmDevicePtr(const DeviceMemoryBase & gpu_mem)80 static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
81 return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
82 }
83
84 // See description on const version above.
AsROCmDevicePtr(DeviceMemoryBase * gpu_mem)85 static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
86 return AsROCmDevicePtr(*gpu_mem);
87 }
88
GetGpuContext(Stream * stream)89 static GpuContext* GetGpuContext(Stream* stream) {
90 return static_cast<GpuExecutor*>(stream->parent()->implementation())
91 ->gpu_context();
92 }
93
ExtractGpuContext(GpuExecutor * rocm_exec)94 GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
95 CHECK(rocm_exec != nullptr);
96 return rocm_exec->gpu_context();
97 }
98
// Tears down the executor: unloads every module still tracked, then destroys
// the GPU context. Kernels and modules are expected to have already been
// released by their owners; the CHECKs below enforce that invariant.
GpuExecutor::~GpuExecutor() {
  for (auto& it : disk_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  for (auto& it : in_memory_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  // Destroy the context only after the modules living in it are unloaded.
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
}
UnloadModule(ModuleHandle module_handle)112 bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
113 const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
114 absl::MutexLock lock{&in_memory_modules_mu_};
115 return UnloadGpuBinary(gpu_binary);
116 }
117
UnloadGpuBinary(const void * gpu_binary)118 bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
119 auto module_it = gpu_binary_to_module_.find(gpu_binary);
120 if (gpu_binary_to_module_.end() == module_it) {
121 VLOG(3) << "No loaded HSACO module for " << gpu_binary;
122 return false;
123 }
124 auto& module = module_it->second.first;
125 auto& refcount = module_it->second.second;
126 VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
127 if (--refcount == 0) {
128 VLOG(3) << "Unloading HSACO module " << module;
129 GpuDriver::UnloadModule(context_, module);
130 gpu_binary_to_module_.erase(module_it);
131 const char* mem_it = nullptr;
132 for (auto x : in_memory_modules_) {
133 if (x.second == module) mem_it = x.first;
134 }
135 if (mem_it != nullptr) in_memory_modules_.erase(mem_it);
136 }
137 return true;
138 }
139
UnloadKernel(const KernelBase * kernel)140 void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
141 VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
142
143 absl::MutexLock lock{&in_memory_modules_mu_};
144 auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
145 if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
146 VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
147 << " has never been loaded.";
148 return; // We've never seen this kernel.
149 }
150 VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
151 << " has loaded GPU code " << gpu_binary_it->second;
152 UnloadGpuBinary(gpu_binary_it->second);
153 kernel_to_gpu_binary_.erase(gpu_binary_it);
154 }
155
Init(int device_ordinal,DeviceOptions device_options)156 port::Status GpuExecutor::Init(int device_ordinal,
157 DeviceOptions device_options) {
158 device_ordinal_ = device_ordinal;
159
160 auto status = GpuDriver::Init();
161 if (!status.ok()) {
162 return status;
163 }
164
165 status = GpuDriver::GetDevice(device_ordinal_, &device_);
166 if (!status.ok()) {
167 return status;
168 }
169
170 status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
171 &context_);
172 if (!status.ok()) {
173 return status;
174 }
175
176 return GpuDriver::GetGpuISAVersion(&version_, device_);
177 }
178
FindOnDiskForComputeCapability(absl::string_view filename,absl::string_view canonical_suffix,string * found_filename) const179 bool GpuExecutor::FindOnDiskForComputeCapability(
180 absl::string_view filename, absl::string_view canonical_suffix,
181 string* found_filename) const {
182 LOG(FATAL) << "Feature not supported on ROCM platform "
183 "(FindOnDiskForComputeCapability)";
184 return false;
185 }
186
FindOnDiskForISAVersion(absl::string_view filename,absl::string_view canonical_suffix,string * found_filename) const187 bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
188 absl::string_view canonical_suffix,
189 string* found_filename) const {
190 if (version_ == 0) {
191 return false;
192 }
193
194 string cc_specific =
195 absl::StrCat(filename, ".cc", version_, canonical_suffix);
196 if (port::FileExists(cc_specific).ok()) {
197 VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
198 << cc_specific;
199 *found_filename = cc_specific;
200 return true;
201 }
202
203 VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
204 << cc_specific;
205 if (port::FileExists(string(filename)).ok()) {
206 *found_filename = string(filename);
207 return true;
208 }
209
210 return false;
211 }
212
213 // Returns the path to the running executable.
214 // N.B. Derived from //knowledge/smalltalk/background_kb.cc
215 // Arg: strip_exe: if true, remove the name of the executable itself from the
216 // returned string. Example: calling this from /usr/bin/foo
217 // would return /usr/bin.
GetBinaryDir(bool strip_exe)218 static string GetBinaryDir(bool strip_exe) {
219 char exe_path[PATH_MAX] = {0};
220 PCHECK(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1) != -1);
221 // Make sure it's null-terminated:
222 exe_path[sizeof(exe_path) - 1] = 0;
223
224 if (strip_exe) {
225 // The exe is the last component of the path, so remove one component.
226 string ret = exe_path;
227 std::vector<string> components = absl::StrSplit(exe_path, '/');
228 components.pop_back();
229 return absl::StrJoin(components, "/");
230 }
231 return exe_path;
232 }
233
// Resolves `spec` into a loaded device function stored in `kernel`. Only
// in-memory HSACO images are supported on ROCM; on-disk specs are rejected.
// The module for a given image pointer is cached in in_memory_modules_.
port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
  GpuKernel* rocm_kernel = AsGpuKernel(kernel);
  hipModule_t module = nullptr;
  const string* kernelname;

  // The "cuda_cubin" accessors carry HSACO payloads on this platform.
  const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
  bool has_cubin = spec.has_cuda_cubin_on_disk();
  if (has_cubin) {
    on_disk_spec = &spec.cuda_cubin_on_disk();
  }

  if (on_disk_spec != nullptr) {
    return port::InternalError(
        "Loading ROCM kernel from disk is not supported");
  } else if (spec.has_cuda_cubin_in_memory()) {
    kernelname = &spec.cuda_cubin_in_memory().kernelname();

    const char* hsaco = spec.cuda_cubin_in_memory().bytes();
    absl::MutexLock lock{&in_memory_modules_mu_};
    // operator[] default-inserts nullptr on a miss, which routes us into
    // LoadHsaco below.
    module = in_memory_modules_[hsaco];

    if (module == nullptr) {
      // NOTE(review): the freshly loaded module is not written back into
      // in_memory_modules_ here, so the cache entry stays nullptr and the
      // image appears to be reloaded on the next call — confirm intent.
      TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, &module));
    }
    kernel_to_gpu_binary_[kernel] = hsaco;
  } else {
    return port::InternalError("No method of loading ROCM kernel provided");
  }

  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    rocm_kernel->gpu_function_ptr())) {
    return port::InternalError("Failed getting module function");
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the ROCM API.
  rocm_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  TF_RETURN_IF_ERROR(GetKernelMetadata(rocm_kernel, &kernel_metadata));
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return port::Status::OK();
}
280
GetKernelMetadata(GpuKernel * rocm_kernel,KernelMetadata * kernel_metadata)281 port::Status GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
282 KernelMetadata* kernel_metadata) {
283 int value = 0;
284 // TODO(ROCm) implement this feature in HIP
285 kernel_metadata->set_registers_per_thread(value);
286
287 // TODO(ROCm) implement this feature in HIP
288 kernel_metadata->set_shared_memory_bytes(value);
289 return port::Status::OK();
290 }
291
// Enqueues `kernel` on `stream` with the given grid (block_dims) and block
// (thread_dims) geometry. Arguments are flattened into a buffer handed to the
// driver through the HIP_LAUNCH_PARAM_* "extra" protocol.
port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  GpuStreamHandle hipstream = AsGpuStreamValue(stream);
  const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
  hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once. Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    absl::MutexLock lock(&launched_kernels_mu_);
    if (!launched_kernels_.count(hipfunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(hipfunc);
    }
  }

  // Apply the kernel's cache preference, if it expressed one.
  if (rocm_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
        hipfunc, rocm_kernel->GetGpuCacheConfig()));
  }

  // prepare kernargs
  // KernelArgsArrayBase keeps the pointer of arguments
  // deference them here
  // NOTE(review): each argument is read as a single uint64 — this assumes all
  // kernel arguments are exactly 8 bytes; confirm for non-pointer args.
  std::vector<void*> kernargs;
  KernelArgIterator iter = args.arg_iterator();
  while (iter.has_next()) {
    KernelArg arg = iter.next();
    VLOG(2) << "*(arg.address): "
            << reinterpret_cast<void*>(
                   *static_cast<const uint64_t*>(arg.address));
    kernargs.push_back(
        reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
  }

  // Package the flattened argument buffer in the HIP_LAUNCH_PARAM_* form the
  // driver expects; `size` must outlive the launch call below.
  size_t size = sizeof(void*) * kernargs.size();
  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};

  return GpuDriver::LaunchKernel(
      GetGpuContext(stream), kernel.name(), hipfunc, block_dims.x, block_dims.y,
      block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
      args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config);
}
343
CalculateOccupancy(const DeviceDescription & device_description,uint64 registers_per_thread,uint64 shared_memory_per_block,const ThreadDim & thread_dims,GpuFunctionHandle func)344 int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
345 uint64 registers_per_thread,
346 uint64 shared_memory_per_block,
347 const ThreadDim& thread_dims,
348 GpuFunctionHandle func) {
349 LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
350 return 0;
351 }
352
CompareOccupancy(int * initial_blocks,const DeviceDescription & device_description,uint64 registers_per_thread,uint64 shared_memory_per_block,const ThreadDim & thread_dims,GpuFunctionHandle func)353 int GpuExecutor::CompareOccupancy(int* initial_blocks,
354 const DeviceDescription& device_description,
355 uint64 registers_per_thread,
356 uint64 shared_memory_per_block,
357 const ThreadDim& thread_dims,
358 GpuFunctionHandle func) {
359 LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
360 return 0;
361 }
362
LoadModule(const MultiModuleLoaderSpec & spec,ModuleHandle * module_handle)363 port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
364 ModuleHandle* module_handle) {
365 // In GpuExecutor we store the pointer to the HSACO binary as
366 // ModuleHandle::id().
367 hipModule_t hip_module = nullptr;
368 // TODO(ROCm): Need generic term instead of cubin/cuda/ptx
369 if (spec.has_cuda_cubin_in_memory()) {
370 absl::MutexLock lock{&in_memory_modules_mu_};
371 TF_RETURN_IF_ERROR(LoadModuleFromHsaco(
372 reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
373 &hip_module));
374 *module_handle = ModuleHandle(const_cast<void*>(
375 static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
376 return port::Status::OK();
377 } else {
378 return port::InternalError("No HASCO binary found");
379 }
380 }
381
LoadModuleFromCuBin(const char * cubin,hipModule_t * module)382 port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
383 hipModule_t* module) {
384 LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
385 }
386
LoadModuleFromPtx(const char * ptx,hipModule_t * module)387 port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx,
388 hipModule_t* module) {
389 LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
390 }
391
// Loads (or re-references) the HSACO image at `hsaco`, returning its module in
// *module. A per-image refcount in gpu_binary_to_module_ lets repeated loads
// share a single module. Caller must hold in_memory_modules_mu_.
port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
                                              hipModule_t* module) {
  uint64_t module_refcount;
  // operator[] default-constructs a {nullptr, 0} entry on first use.
  std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, module));
    module_refcount = 1;
    in_memory_modules_[hsaco] = *module;
    VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
            << " is already loaded as module " << *module;
  }
  // Write the (possibly new) module/refcount pair back into the cache.
  gpu_binary_to_module_[hsaco] = {*module, module_refcount};
  return port::Status::OK();
}
411
412 // This is a non-essential operation; if there's a failure, proceed without
413 // logging an error. It's nearly certain that in case of failures, we'd never
414 // get here in the first place; these are very low-impact routines.
// Intended to log occupancy information for `kernel` at the given geometry;
// currently a no-op on ROCM.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  // TODO(ROCm) implement this feature in HIP
}
420
Allocate(uint64 size,int64 memory_space)421 DeviceMemoryBase GpuExecutor::Allocate(uint64 size, int64 memory_space) {
422 CHECK_EQ(memory_space, 0);
423 return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
424 }
425
GetSubBuffer(DeviceMemoryBase * mem,uint64 offset_bytes,uint64 size_bytes)426 void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
427 uint64 size_bytes) {
428 // offset and size are in bytes, so char* works as the pointer type.
429 return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
430 }
431
// Returns the device memory referenced by `mem` to the driver. The
// DeviceMemoryBase wrapper itself remains owned by the caller.
void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  GpuDriver::DeviceDeallocate(context_, mem->opaque());
}
435
// Pins `size` bytes of host memory at `location` for faster DMA. A null or
// zero-sized request is logged as suspicious but still forwarded to the
// driver, which reports the final result.
bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}
444
// Unpins host memory previously registered via HostMemoryRegister.
bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}
449
// Blocks until all outstanding work in this executor's context completes.
bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}
453
SynchronousMemZero(DeviceMemoryBase * location,uint64 size)454 port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
455 uint64 size) {
456 if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
457 size % 4 == 0) {
458 return GpuDriver::SynchronousMemsetUint32(
459 context_, AsROCmDevicePtr(location), 0x0, size / 4);
460 }
461 return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
462 0x0, size);
463 }
464
SynchronousMemSet(DeviceMemoryBase * location,int value,uint64 size)465 port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
466 int value, uint64 size) {
467 if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
468 size % 4 == 0) {
469 // hipMemset reinterprets "value" as a uint8.
470 uint8 byte_value = static_cast<uint8>(value);
471 uint32 pattern = (byte_value << 24) | (byte_value << 16) |
472 (byte_value << 8) | byte_value;
473 return GpuDriver::SynchronousMemsetUint32(
474 context_, AsROCmDevicePtr(location), pattern, size / 4);
475 }
476 return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
477 value, size);
478 }
479
SynchronousMemcpy(DeviceMemoryBase * gpu_dst,const void * host_src,uint64 size)480 port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
481 const void* host_src, uint64 size) {
482 return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
483 host_src, size);
484 }
485
SynchronousMemcpy(void * host_dst,const DeviceMemoryBase & gpu_src,uint64 size)486 port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
487 const DeviceMemoryBase& gpu_src,
488 uint64 size) {
489 return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
490 AsROCmDevicePtr(gpu_src), size);
491 }
492
SynchronousMemcpyDeviceToDevice(DeviceMemoryBase * gpu_dst,const DeviceMemoryBase & gpu_src,uint64 size)493 port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
494 DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
495 return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
496 AsROCmDevicePtr(gpu_src), size);
497 }
498
MemZero(Stream * stream,DeviceMemoryBase * location,uint64 size)499 port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
500 uint64 size) {
501 if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
502 size % 4 == 0) {
503 return Memset32(stream, location, 0x0, size);
504 } else {
505 return Memset(stream, location, 0x0, size);
506 }
507 }
508
Memset(Stream * stream,DeviceMemoryBase * location,uint8 pattern,uint64 size)509 port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
510 uint8 pattern, uint64 size) {
511 VLOG(2) << "enqueueing memset8 operation onto stream " << stream
512 << " at location " << location << " with size " << size
513 << " and pattern " << std::hex << pattern;
514 return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
515 pattern, size,
516 AsGpuStreamValue(stream));
517 }
518
Memset32(Stream * stream,DeviceMemoryBase * location,uint32 pattern,uint64 size)519 port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
520 uint32 pattern, uint64 size) {
521 VLOG(2) << "enqueueing memset32 operation onto stream " << stream
522 << " at location " << location << " with size " << size
523 << " and pattern " << std::hex << pattern;
524 CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
525 size % 4 == 0);
526 return GpuDriver::AsynchronousMemsetUint32(
527 context_, AsROCmDevicePtr(location), pattern, size / 4,
528 AsGpuStreamValue(stream));
529 }
530
Memcpy(Stream * stream,void * host_dst,const DeviceMemoryBase & gpu_src,uint64 size)531 bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
532 const DeviceMemoryBase& gpu_src, uint64 size) {
533 return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
534 AsROCmDevicePtr(gpu_src), size,
535 AsGpuStreamValue(stream));
536 }
537
Memcpy(Stream * stream,DeviceMemoryBase * gpu_dst,const void * host_src,uint64 size)538 bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
539 const void* host_src, uint64 size) {
540 return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
541 host_src, size,
542 AsGpuStreamValue(stream));
543 }
544
MemcpyDeviceToDevice(Stream * stream,DeviceMemoryBase * gpu_dst,const DeviceMemoryBase & gpu_src,uint64 size)545 bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
546 DeviceMemoryBase* gpu_dst,
547 const DeviceMemoryBase& gpu_src,
548 uint64 size) {
549 return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
550 AsROCmDevicePtr(gpu_src), size,
551 AsGpuStreamValue(stream));
552 }
553
HostCallback(Stream * stream,std::function<port::Status ()> callback)554 bool GpuExecutor::HostCallback(Stream* stream,
555 std::function<port::Status()> callback) {
556 auto callback_ptr = new std::function<void()>([callback]() {
557 port::Status s = callback();
558 if (!s.ok()) {
559 LOG(WARNING) << "Host callback failed: " << s;
560 }
561 });
562 return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
563 InternalHostCallback, callback_ptr);
564 }
565
InternalHostCallback(GpuStreamHandle stream,hipError_t status,void * data)566 /* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
567 hipError_t status,
568 void* data) {
569 std::function<void()>* callback =
570 reinterpret_cast<std::function<void()>*>(data);
571 (*callback)();
572 delete callback;
573 }
574
AllocateEvent(Event * event)575 port::Status GpuExecutor::AllocateEvent(Event* event) {
576 return AsGpuEvent(event)->Init();
577 }
578
DeallocateEvent(Event * event)579 port::Status GpuExecutor::DeallocateEvent(Event* event) {
580 return AsGpuEvent(event)->Destroy();
581 }
582
RecordEvent(Stream * stream,Event * event)583 port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
584 return AsGpuEvent(event)->Record(AsGpuStream(stream));
585 }
586
WaitForEvent(Stream * stream,Event * event)587 port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
588 if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
589 AsGpuEvent(event)->gpu_event())) {
590 return port::Status::OK();
591 } else {
592 return port::Status{
593 port::error::INTERNAL,
594 absl::StrFormat("error recording waiting for ROCM event on stream %p",
595 stream)};
596 }
597 }
598
PollForEventStatus(Event * event)599 Event::Status GpuExecutor::PollForEventStatus(Event* event) {
600 return AsGpuEvent(event)->PollForStatus();
601 }
602
AllocateStream(Stream * stream)603 bool GpuExecutor::AllocateStream(Stream* stream) {
604 return AsGpuStream(stream)->Init();
605 }
606
DeallocateStream(Stream * stream)607 void GpuExecutor::DeallocateStream(Stream* stream) {
608 GpuStream* rocm_stream = AsGpuStream(stream);
609 if (!rocm_stream->IsIdle()) {
610 LOG(ERROR) << "Deallocating stream with pending work";
611 }
612 rocm_stream->Destroy();
613 }
614
AllocateTimer(Timer * timer)615 bool GpuExecutor::AllocateTimer(Timer* timer) {
616 return AsGpuTimer(timer)->Init();
617 }
618
DeallocateTimer(Timer * timer)619 void GpuExecutor::DeallocateTimer(Timer* timer) {
620 AsGpuTimer(timer)->Destroy();
621 }
622
CreateStreamDependency(Stream * dependent,Stream * other)623 bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
624 GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
625 bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
626 AsGpuStreamValue(other))
627 .ok();
628 if (!ok) {
629 LOG(ERROR) << "failed to record completion event; "
630 "therefore, failed to create inter-stream dependency";
631 return false;
632 }
633
634 return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
635 other_completed_event);
636 }
637
StartTimer(Stream * stream,Timer * timer)638 bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
639 return AsGpuTimer(timer)->Start(AsGpuStream(stream));
640 }
641
StopTimer(Stream * stream,Timer * timer)642 bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
643 return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
644 }
645
BlockHostUntilDone(Stream * stream)646 port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
647 return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
648 }
649
CreateBlas()650 blas::BlasSupport* GpuExecutor::CreateBlas() {
651 PluginRegistry* registry = PluginRegistry::Instance();
652 port::StatusOr<PluginRegistry::BlasFactory> status =
653 registry->GetFactory<PluginRegistry::BlasFactory>(rocm::kROCmPlatformId,
654 plugin_config_.blas());
655 if (!status.ok()) {
656 LOG(ERROR) << "Unable to retrieve BLAS factory: "
657 << status.status().error_message();
658 return nullptr;
659 }
660
661 return status.ValueOrDie()(this);
662 }
663
CreateDnn()664 dnn::DnnSupport* GpuExecutor::CreateDnn() {
665 PluginRegistry* registry = PluginRegistry::Instance();
666 port::StatusOr<PluginRegistry::DnnFactory> status =
667 registry->GetFactory<PluginRegistry::DnnFactory>(rocm::kROCmPlatformId,
668 plugin_config_.dnn());
669 if (!status.ok()) {
670 LOG(ERROR) << "Unable to retrieve DNN factory: "
671 << status.status().error_message();
672 return nullptr;
673 }
674
675 return status.ValueOrDie()(this);
676 }
677
CreateFft()678 fft::FftSupport* GpuExecutor::CreateFft() {
679 PluginRegistry* registry = PluginRegistry::Instance();
680 port::StatusOr<PluginRegistry::FftFactory> status =
681 registry->GetFactory<PluginRegistry::FftFactory>(rocm::kROCmPlatformId,
682 plugin_config_.fft());
683 if (!status.ok()) {
684 LOG(ERROR) << "Unable to retrieve FFT factory: "
685 << status.status().error_message();
686 return nullptr;
687 }
688
689 return status.ValueOrDie()(this);
690 }
691
CreateRng()692 rng::RngSupport* GpuExecutor::CreateRng() {
693 PluginRegistry* registry = PluginRegistry::Instance();
694 port::StatusOr<PluginRegistry::RngFactory> status =
695 registry->GetFactory<PluginRegistry::RngFactory>(rocm::kROCmPlatformId,
696 plugin_config_.rng());
697 if (!status.ok()) {
698 LOG(ERROR) << "Unable to retrieve RNG factory: "
699 << status.status().error_message();
700 return nullptr;
701 }
702
703 return status.ValueOrDie()(this);
704 }
705
706 // TODO(rspringer): Remove in b/18544742.
// DNN support is provided through the plugin registry (see CreateDnn).
bool GpuExecutor::SupportsDnn() const { return true; }
708
CanEnablePeerAccessTo(StreamExecutorInterface * other)709 bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
710 GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
711 return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
712 }
713
EnablePeerAccessTo(StreamExecutorInterface * other)714 port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
715 GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
716 return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
717 }
718
// Reports free and total device memory in bytes; returns false on failure.
bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}
722
GetSymbol(const string & symbol_name,ModuleHandle module_handle,void ** mem,size_t * bytes)723 bool GpuExecutor::GetSymbol(const string& symbol_name,
724 ModuleHandle module_handle, void** mem,
725 size_t* bytes) {
726 absl::MutexLock lock{&in_memory_modules_mu_};
727 if (static_cast<bool>(module_handle)) {
728 auto it = gpu_binary_to_module_.find(module_handle.id());
729 CHECK(it != gpu_binary_to_module_.end());
730 if (GpuDriver::GetModuleSymbol(
731 context_, it->second.first, symbol_name.c_str(),
732 reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
733 return true;
734 }
735 }
736
737 for (auto& it : gpu_binary_to_module_) {
738 if (GpuDriver::GetModuleSymbol(
739 context_, it.second.first, symbol_name.c_str(),
740 reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
741 return true;
742 }
743 }
744
745 LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name;
746 return false;
747 }
748
// Queries the driver for the maximum grid dimensions of `device` and writes
// them into `block_dim_limit`. Returns false (output untouched) on failure.
bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
    return false;
  }

  block_dim_limit->x = x;
  block_dim_limit->y = y;
  block_dim_limit->z = z;
  return true;
}
764
// BLAS support is provided through the plugin registry (see CreateBlas).
bool GpuExecutor::SupportsBlas() const { return true; }
766
// FFT support is provided through the plugin registry (see CreateFft).
bool GpuExecutor::SupportsFft() const { return true; }
768
// RNG support is provided through the plugin registry (see CreateRng).
bool GpuExecutor::SupportsRng() const { return true; }
770
771 std::unique_ptr<internal::EventInterface>
CreateEventImplementation()772 GpuExecutor::CreateEventImplementation() {
773 return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
774 }
775
776 std::unique_ptr<internal::KernelInterface>
CreateKernelImplementation()777 GpuExecutor::CreateKernelImplementation() {
778 return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
779 }
780
781 std::unique_ptr<internal::StreamInterface>
GetStreamImplementation()782 GpuExecutor::GetStreamImplementation() {
783 return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
784 }
785
786 std::unique_ptr<internal::TimerInterface>
GetTimerImplementation()787 GpuExecutor::GetTimerImplementation() {
788 return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
789 }
790
GpuContextHack()791 void* GpuExecutor::GpuContextHack() { return context_; }
792
gpu_context()793 GpuContext* GpuExecutor::gpu_context() { return context_; }
794
// Intended to read the NUMA node corresponding to the GPU device's PCI bus
// out of SysFS (as the CUDA counterpart does, returning -1 on failure).
//
// NOTE: The ROCm port has not implemented this yet — both parameters are
// ignored and the function unconditionally reports NUMA node 1.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
  // TODO(ROCm) implement this feature in HIP
  return 1;
}
804
// Builds a DeviceDescription for the ROCm device at `device_ordinal` by
// querying the driver for hardware properties. Returns the first error
// status encountered while resolving the device, its ISA version, or its
// GCN architecture name; later per-property query failures are either
// ignored (best-effort fields) or terminate via ValueOrDie().
port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  int version;
  status = GpuDriver::GetGpuISAVersion(&version, device);
  if (!status.ok()) {
    return status;
  }

  std::string gcn_arch_name;
  status = GpuDriver::GetGpuGCNArchName(device, &gcn_arch_name);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    // Driver version: combine the numeric version with the DSO-derived
    // diagnostic string; failure to read the numeric version is ignored
    // (driver_version stays 0).
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        rocm::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
            .c_str());
    builder.set_driver_version(augmented_driver_version);
  }

  {
    string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lower the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    // (Currently a ROCm stub that always reports node 1.)
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  // Best-effort block: on property-query failure these fields are simply
  // left unset on the builder.
  hipDeviceProp_t prop;
  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal)) {
    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = prop.maxThreadsDim[0];
    thread_dim_limit.y = prop.maxThreadsDim[1];
    thread_dim_limit.z = prop.maxThreadsDim[2];
    builder.set_thread_dim_limit(thread_dim_limit);

    // prop.clockRate is reported in kHz; divide by 1e6 to get GHz.
    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
    builder.set_clock_rate_ghz(clock_rate_ghz);

    // mem_bandwidth = 2 * mem_bus_width_in_bytes * mem_clock_rate_in_hz
    int64 memory_bandwidth = 2 * (int64(prop.memoryBusWidth) / 8) *
                             (int64(prop.memoryClockRate) * 1000);
    builder.set_memory_bandwidth(memory_bandwidth);
  }

  {
    // ECC status: query failure is ignored, defaulting to "disabled".
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    // Total device memory: query failure is ignored, leaving the sentinel
    // value of -1 (all-ones for the unsigned type).
    uint64 device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  {
    // Grid-dimension limits; on failure block_dim_limit keeps its
    // default-constructed values.
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("AMDGPU ISA version: ", gcn_arch_name));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("Advanced Micro Devices, Inc");
  builder.set_rocm_amdgpu_isa_version(version);
  builder.set_rocm_amdgpu_gcn_arch_name(gcn_arch_name);

  // The queries below terminate the process (ValueOrDie) if they fail.
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  builder.set_registers_per_core_limit(64 * 1024);

  return builder.Build();
}
919
920 } // namespace gpu
921
922 } // namespace stream_executor
923
924 REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {});
925