1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include <stdint.h>
17 #include <stdlib.h>
18
19 #include <map>
20 #include <set>
21 #include <utility>
22
23 #include "absl/base/casts.h"
24 #include "absl/container/inlined_vector.h"
25 #include "absl/strings/str_cat.h"
26 #include "absl/strings/str_format.h"
27 #include "absl/synchronization/mutex.h"
28 #include "absl/synchronization/notification.h"
29 #include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
30 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
31 #include "tensorflow/stream_executor/lib/env.h"
32 #include "tensorflow/stream_executor/lib/error.h"
33 #include "tensorflow/stream_executor/lib/human_readable.h"
34 #include "tensorflow/stream_executor/lib/stacktrace.h"
35 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
36 #include "tensorflow/stream_executor/lib/threadpool.h"
37 #include "tensorflow/stream_executor/platform/logging.h"
38 #include "tensorflow/stream_executor/platform/port.h"
39 #include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"
40
// Debug/test knobs. These are plain globals (not wired to a flag parser in
// this file): inject an hipInit failure, device-synchronize around every
// driver call, or restrict to device 0.
bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
bool FLAGS_gpuexec_rocm_device_0_only = false;

// Evaluates `expr` (a hipError_t expression); on any result other than
// hipSuccess, returns a port::InternalError whose message is the
// concatenation of the remaining arguments, ": ", and the stringified error.
#define RETURN_IF_ROCM_ERROR(expr, ...)                                  \
  do {                                                                   \
    hipError_t _res = (expr);                                            \
    if (TF_PREDICT_FALSE(_res != hipSuccess)) {                          \
      return port::InternalError(absl::StrCat(                           \
          __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res)));   \
    }                                                                    \
  } while (0)
53
54 // Debugging: on each push and pop of a rocm context, verify the current device
55 // matches the expected one.
56 constexpr bool kVerifyGpuContext = false;
57
58 namespace stream_executor {
59 namespace gpu {
60
// GpuContext wraps the device_ordinal.
// Only reason we need this wrapper class is to make the GpuDriver* API
class GpuContext {
 public:
  // Stores the HIP device ordinal this context stands for; there is no
  // driver-side context object on ROCm, unlike CUDA.
  GpuContext(const int v) : device_ordinal_(v) {}

  // Returns the device ordinal this context was constructed with.
  int device_ordinal() const { return device_ordinal_; }

  // Disallow copying and moving.
  GpuContext(GpuContext&&) = delete;
  GpuContext(const GpuContext&) = delete;
  GpuContext& operator=(GpuContext&&) = delete;
  GpuContext& operator=(const GpuContext&) = delete;

 private:
  const int device_ordinal_;
};
78
79 namespace {
80
// Formats hipError_t to output prettified values into a log stream.
// Error summaries taken from:
string ToString(hipError_t result) {
// Expands to one `case hipError<Name>: return "HIP_ERROR_<Name>";` entry.
#define OSTREAM_ROCM_ERROR(__name) \
  case hipError##__name:           \
    return "HIP_ERROR_" #__name;

  switch (result) {
    OSTREAM_ROCM_ERROR(InvalidValue)
    OSTREAM_ROCM_ERROR(OutOfMemory)
    OSTREAM_ROCM_ERROR(NotInitialized)
    OSTREAM_ROCM_ERROR(Deinitialized)
    OSTREAM_ROCM_ERROR(NoDevice)
    OSTREAM_ROCM_ERROR(InvalidDevice)
    OSTREAM_ROCM_ERROR(InvalidImage)
    OSTREAM_ROCM_ERROR(InvalidContext)
    OSTREAM_ROCM_ERROR(InvalidHandle)
    OSTREAM_ROCM_ERROR(NotFound)
    OSTREAM_ROCM_ERROR(NotReady)
    OSTREAM_ROCM_ERROR(NoBinaryForGpu)

    // Encountered an uncorrectable ECC error during execution.
    OSTREAM_ROCM_ERROR(ECCNotCorrectable)

    // Load/store on an invalid address. Must reboot all context.
    // NOTE(review): 700/701 are raw numeric codes with no hipError enumerator
    // used here; presumably they mirror the CUDA error numbers — confirm
    // against the HIP headers in use.
    case 700:
      return "ROCM_ERROR_ILLEGAL_ADDRESS";
    // Passed too many / wrong arguments, too many threads for register count.
    case 701:
      return "ROCM_ERROR_LAUNCH_OUT_OF_RESOURCES";

    OSTREAM_ROCM_ERROR(ContextAlreadyInUse)
    OSTREAM_ROCM_ERROR(PeerAccessUnsupported)
    OSTREAM_ROCM_ERROR(Unknown)  // Unknown internal error to ROCM.
    default:
      // Fall back to the raw numeric value for codes not enumerated above.
      return absl::StrCat("hipError_t(", static_cast<int>(result), ")");
  }
}
119
120 // ROCM driver routines may require a large amount of stack (particularly
121 // hipModuleLoadDataEx, in our experience). To avoid stack overflow when using
122 // stack-limited threads (such as those spawned by a default-argument
123 // thread::ThreadPool on some platforms), we run certain routines in this pool
124 // and wait for completion.
GetDriverExecutor()125 port::ThreadPool* GetDriverExecutor() {
126 static port::ThreadPool* thread_pool = new port::ThreadPool(
127 port::Env::Default(), port::ThreadOptions(), "rocm_driver", 1);
128 return thread_pool;
129 }
130
131 } // namespace
132
MemorySpaceString(MemorySpace memory_space)133 string MemorySpaceString(MemorySpace memory_space) {
134 switch (memory_space) {
135 case MemorySpace::kHost:
136 return "host";
137 case MemorySpace::kDevice:
138 return "device";
139 default:
140 LOG(FATAL) << "impossible memory space";
141 }
142 }
143
144 // Returns the current device set in HIP. This is done by calling the
145 // HIP driver (e.g., this value is not our cached view of the current device).
CurrentDeviceOrDie()146 static int CurrentDeviceOrDie() {
147 int current = -1;
148 hipError_t result = tensorflow::wrap::hipGetDevice(¤t);
149 if (result != hipSuccess) {
150 LOG(FATAL) << "failed to query current device: " << ToString(result);
151 }
152 return current;
153 }
154
155 namespace {
156
157 // Call hipDeviceSynchronize and crash if it doesn't succeed.
SynchronizeOrDie()158 void SynchronizeOrDie() {
159 auto res = tensorflow::wrap::hipDeviceSynchronize();
160 if (res != hipSuccess) {
161 LOG(FATAL) << "Synchronize found " << ToString(res)
162 << " :: " << port::CurrentStackTrace();
163 }
164 }
165
// Per-thread bookkeeping for ScopedActivateContext: the device ordinal this
// thread believes HIP currently has set, and the nesting depth of activation
// scopes on this thread.
struct ThreadLocalData {
  int current_device_ordinal;  // cached view of hipGetDevice for this thread
  int depth;                   // number of live ScopedActivateContext scopes
};

SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
172
173 } // namespace
174
// Activates `context`'s device on the calling thread, switching via
// hipSetDevice only when the thread-local cached device differs.
ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
  // Debug flag: device-synchronize around every driver call to localize
  // failures.
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();
  if (tls->depth == 0) {
    // Outermost scope on this thread: seed the cached ordinal from HIP.
    tls->current_device_ordinal = CurrentDeviceOrDie();
  }

  if (kVerifyGpuContext) {
    // Debug check: our cached device must match what HIP reports.
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  tls->depth++;

  // Saved for the destructor, which consults it when deciding whether to
  // switch the device back.
  to_restore_ = context;

  if (context->device_ordinal() == tls->current_device_ordinal) {
    // Already on the right device; nothing to switch.
    DCHECK_EQ(CurrentDeviceOrDie(), context->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to " << context->device_ordinal();

  // Set the device and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(context->device_ordinal()));
  tls->current_device_ordinal = context->device_ordinal();
}
206
// Unwinds one activation scope. NOTE(review): this restores the device to
// to_restore_ (the context passed to the constructor), not to the device that
// was current before construction — mirrors the constructor's bookkeeping;
// confirm this matches the intended CUDA-parity semantics.
ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    // Debug check: our cached device must match what HIP reports.
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);

  if (to_restore_->device_ordinal() == tls->current_device_ordinal) {
    // Already on the right device; nothing to switch.
    DCHECK_EQ(CurrentDeviceOrDie(), to_restore_->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to "
          << to_restore_->device_ordinal();

  // Set context and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal()));
  tls->current_device_ordinal = to_restore_->device_ordinal();
}
235
236 namespace {
237
238 // Returns a stringified device number associated with pointer, primarily for
239 // logging purposes. Returns "?" if the device could not be successfully
240 // queried.
ROCMPointerToDeviceString(hipDeviceptr_t pointer)241 string ROCMPointerToDeviceString(hipDeviceptr_t pointer) {
242 auto value = GpuDriver::GetPointerDevice(pointer);
243 if (value.ok()) {
244 return absl::StrCat(value.ValueOrDie());
245 }
246 LOG(ERROR) << "could not query device: " << value.status();
247 return "?";
248 }
249
250 // Returns a stringified memory space associated with pointer, primarily for
251 // logging purposes. Returns "?" if the memory space could not be successfully
252 // queried.
ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer)253 string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
254 auto value = GpuDriver::GetPointerMemorySpace(pointer);
255 if (value.ok()) {
256 return MemorySpaceString(value.ValueOrDie());
257 }
258 LOG(ERROR) << "could not query device: " << value.status();
259 return "?";
260 }
261
// Returns a stringified representation of whether or not peer access is
// permitted between the "from" and "to" pointers' associated contexts,
// primarily for logging purposes. Returns "error" if an error is encountered
// in the process of querying.
string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
  // Resolve each pointer to its owning device via the HIP attribute query.
  hipPointerAttribute_t from_pointerAttributes;
  hipError_t result =
      tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from);
  if (result != hipSuccess) {
    LOG(ERROR) << "could not retrieve source pointer's device: "
               << ToString(result);
    return "error";
  }

  hipPointerAttribute_t to_pointerAttributes;
  result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to);
  if (result != hipSuccess) {
    LOG(ERROR) << "could not retrieve destination pointer's device: "
               << ToString(result);
    return "error";
  }

  // GpuContext is just an ordinal wrapper, so temporaries suffice here.
  GpuContext fromCtx(from_pointerAttributes.device);
  GpuContext toCtx(to_pointerAttributes.device);

  return GpuDriver::CanEnablePeerAccess(&fromCtx, &toCtx) ? "true" : "false";
}
289
290 // Actually performs the work of ROCM initialization. Wrapped up in one-time
291 // execution guard.
InternalInit()292 static port::Status InternalInit() {
293 hipError_t res = hipErrorNoDevice;
294 if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
295 LOG(ERROR) << "injecting ROCM init error; initialization will fail";
296 } else {
297 res = tensorflow::wrap::hipInit(0 /* = flags */);
298 }
299
300 if (res == hipSuccess) {
301 return port::Status::OK();
302 }
303
304 LOG(ERROR) << "failed call to hipInit: " << ToString(res);
305 Diagnostician::LogDiagnosticInformation();
306 return port::Status{port::error::ABORTED,
307 absl::StrCat("failed call to hipInit: ", ToString(res))};
308 }
309
310 } // namespace
311
Init()312 /* static */ port::Status GpuDriver::Init() {
313 // Cached return value from calling InternalInit(), as hipInit need only be
314 // called once, but GpuDriver::Init may be called many times.
315 static port::Status* init_retval = [] {
316 return new port::Status(InternalInit());
317 }();
318 return *init_retval;
319 }
320
GetDevice(int device_ordinal,hipDevice_t * device)321 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
322 hipDevice_t* device) {
323 hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal);
324 if (res == hipSuccess) {
325 return port::Status::OK();
326 }
327
328 return port::Status{
329 port::error::INTERNAL,
330 absl::StrCat("failed call to hipDeviceGet: ", ToString(res))};
331 }
332
GetDeviceName(hipDevice_t device,string * device_name)333 /* static */ port::Status GpuDriver::GetDeviceName(hipDevice_t device,
334 string* device_name) {
335 static const size_t kCharLimit = 64;
336 absl::InlinedVector<char, 4> chars(kCharLimit);
337 RETURN_IF_ROCM_ERROR(
338 tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device),
339 "Failed to get device name");
340 chars[kCharLimit - 1] = '\0';
341 *device_name = chars.begin();
342 return port::Status::OK();
343 }
344
// Translates DeviceOptions into context-creation flags. On ROCm no flags are
// produced (`*flags` is left untouched); the static_assert only guards
// against DeviceOptions growing new option bits without this function being
// revisited. Always reports success.
bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
                                 int* flags) {
  static_assert(DeviceOptions::kMask == 0xf,
                "needs update for new device options");
  return true;
}
351
// Creates a GpuContext for `device_ordinal`. On ROCm there is no driver
// context object, so this only allocates the ordinal wrapper; `device` and
// `device_options` are unused here. Never fails.
/* static */ port::Status GpuDriver::CreateContext(
    int device_ordinal, hipDevice_t device, const DeviceOptions& device_options,
    GpuContext** context) {
  *context = new GpuContext(device_ordinal);
  return port::Status::OK();
}
DestroyContext(GpuContext * context)358 /* static */ void GpuDriver::DestroyContext(GpuContext* context) {
359 if (context == nullptr) {
360 return;
361 }
362 delete context;
363 }
364
// Stub: kernel-attribute queries are not implemented on ROCm. Leaves
// `*attribute_value` untouched and reports success so callers proceed.
/* static */ port::Status GpuDriver::FuncGetAttribute(
    hipDeviceAttribute_t attribute, hipFunction_t func, int* attribute_value) {
  // TODO(ROCm) properly implement this feature in HIP
  return port::Status::OK();
}
370
FuncSetCacheConfig(hipFunction_t function,hipFuncCache_t cache_config)371 /* static */ port::Status GpuDriver::FuncSetCacheConfig(
372 hipFunction_t function, hipFuncCache_t cache_config) {
373 RETURN_IF_ROCM_ERROR(
374 tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config),
375 "Failed to set ROCM kernel cache config.");
376 return port::Status::OK();
377 }
378
// Queries the device-level shared memory configuration (via
// hipDeviceGetSharedMemConfig) with `context`'s device active.
/* static */ port::StatusOr<hipSharedMemConfig>
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  hipSharedMemConfig shared_mem_config;
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config),
      "Failed to get shared memory config");
  return shared_mem_config;
}
388
ContextSetSharedMemConfig(GpuContext * context,hipSharedMemConfig shared_mem_config)389 /* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
390 GpuContext* context, hipSharedMemConfig shared_mem_config) {
391 ScopedActivateContext activation{context};
392 RETURN_IF_ROCM_ERROR(
393 tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config),
394 "Failed to set ROCM device shared memory config");
395 return port::Status::OK();
396 }
397
// Launches `function` on `stream` with the given grid/block geometry and
// dynamic shared memory size. `kernel_params` / `extra` are forwarded
// verbatim to hipModuleLaunchKernel. Returns InternalError on launch failure.
/* static */ port::Status GpuDriver::LaunchKernel(
    GpuContext* context, hipFunction_t function, unsigned int grid_dim_x,
    unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
    unsigned int block_dim_y, unsigned int block_dim_z,
    unsigned int shared_mem_bytes, GpuStreamHandle stream, void** kernel_params,
    void** extra) {
  ScopedActivateContext activation{context};
  VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
          << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
  RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipModuleLaunchKernel(
                           function, grid_dim_x, grid_dim_y, grid_dim_z,
                           block_dim_x, block_dim_y, block_dim_z,
                           shared_mem_bytes, stream, kernel_params, extra),
                       "Failed to launch ROCM kernel");
  // Note: success here means the launch was enqueued, not that it completed.
  VLOG(2) << "successfully launched kernel";
  return port::Status::OK();
}
417
// PTX is a CUDA-only module format; unconditionally fails on ROCm.
/* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
                                             const char* ptx_contents,
                                             hipModule_t* module) {
  LOG(ERROR) << "Feature not supported on ROCm platform (LoadPtx)";
  return port::InternalError("Not Implemented");
}
424
// CUBIN is a CUDA-only module format; unconditionally fails on ROCm.
/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
                                               const char* cubin_bytes,
                                               hipModule_t* module) {
  return port::Status{port::error::INTERNAL,
                      "Feature not supported on ROCm platform (LoadCubin)"};
}
431
LoadHsaco(GpuContext * context,const char * hsaco_contents,hipModule_t * module)432 /* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
433 const char* hsaco_contents,
434 hipModule_t* module) {
435 absl::Notification notification;
436 port::Status ret = port::Status::OK();
437 GetDriverExecutor()->Schedule([context, hsaco_contents, module, &ret,
438 ¬ification]() {
439 ScopedActivateContext activation{context};
440 void* hsaco_data = const_cast<char*>(hsaco_contents);
441
442 hipError_t res = tensorflow::wrap::hipModuleLoadData(module, hsaco_data);
443
444 if (res != hipSuccess) {
445 ret = port::InternalError(
446 absl::StrCat("Failed to load HSACO: ", ToString(res)));
447 notification.Notify();
448 }
449
450 CHECK(module != nullptr);
451 notification.Notify();
452 });
453 notification.WaitForNotification();
454
455 return ret;
456 }
457
SynchronousMemsetUint8(GpuContext * context,hipDeviceptr_t location,uint8 value,size_t size)458 /* static */ port::Status GpuDriver::SynchronousMemsetUint8(
459 GpuContext* context, hipDeviceptr_t location, uint8 value, size_t size) {
460 ScopedActivateContext activation{context};
461 RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipMemsetD8(location, value, size),
462 "Failed to memset memory");
463 return port::Status::OK();
464 }
465
SynchronousMemsetUint32(GpuContext * context,hipDeviceptr_t location,uint32 value,size_t uint32_count)466 /* static */ port::Status GpuDriver::SynchronousMemsetUint32(
467 GpuContext* context, hipDeviceptr_t location, uint32 value,
468 size_t uint32_count) {
469 ScopedActivateContext activation{context};
470 void* pointer = absl::bit_cast<void*>(location);
471 RETURN_IF_ROCM_ERROR(
472 tensorflow::wrap::hipMemsetD32(pointer, value, uint32_count),
473 "Failed to memset memory");
474 return port::Status::OK();
475 }
476
// Enqueues a byte-wise memset of `location` on `stream`.
// NOTE(review): the parameter is named `uint32_count` but is forwarded
// directly as hipMemsetAsync's element count for a byte memset — presumably
// it is a byte count; confirm against the GpuDriver header and callers.
/* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
    GpuContext* context, hipDeviceptr_t location, uint8 value,
    size_t uint32_count, GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return port::Status::OK();
}
486
AsynchronousMemsetUint32(GpuContext * context,hipDeviceptr_t location,uint32 value,size_t uint32_count,GpuStreamHandle stream)487 /* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
488 GpuContext* context, hipDeviceptr_t location, uint32 value,
489 size_t uint32_count, GpuStreamHandle stream) {
490 ScopedActivateContext activation{context};
491 void* pointer = absl::bit_cast<void*>(location);
492 RETURN_IF_ROCM_ERROR(
493 tensorflow::wrap::hipMemsetD32Async(pointer, value, uint32_count, stream),
494 "Failed to enqueue async memset operation");
495 VLOG(2) << "successfully enqueued async memset operation";
496 return port::Status::OK();
497 }
498
AddStreamCallback(GpuContext * context,GpuStreamHandle stream,StreamCallback callback,void * data)499 /* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
500 GpuStreamHandle stream,
501 StreamCallback callback,
502 void* data) {
503 hipError_t res = tensorflow::wrap::hipStreamAddCallback(
504 stream, (hipStreamCallback_t)callback, data, 0 /* = flags */);
505 if (res != hipSuccess) {
506 LOG(ERROR) << "unable to add host callback: " << ToString(res);
507 return false;
508 }
509 return true;
510 }
511
GetModuleFunction(GpuContext * context,hipModule_t module,const char * kernel_name,hipFunction_t * function)512 /* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
513 hipModule_t module,
514 const char* kernel_name,
515 hipFunction_t* function) {
516 ScopedActivateContext activated{context};
517 CHECK(module != nullptr && kernel_name != nullptr);
518 hipError_t res =
519 tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name);
520 if (res != hipSuccess) {
521 LOG(ERROR) << "failed to get kernel \"" << kernel_name
522 << "\" from module: " << ToString(res);
523 return false;
524 }
525
526 return true;
527 }
528
GetModuleSymbol(GpuContext * context,hipModule_t module,const char * symbol_name,hipDeviceptr_t * dptr,size_t * bytes)529 /* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
530 hipModule_t module,
531 const char* symbol_name,
532 hipDeviceptr_t* dptr,
533 size_t* bytes) {
534 ScopedActivateContext activated{context};
535 CHECK(module != nullptr && symbol_name != nullptr &&
536 (dptr != nullptr || bytes != nullptr));
537 hipError_t res =
538 tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name);
539 if (res != hipSuccess) {
540 // symbol may not be found in the current module, but it may reside in
541 // another module.
542 VLOG(2) << "failed to get symbol \"" << symbol_name
543 << "\" from module: " << ToString(res);
544 return false;
545 }
546
547 return true;
548 }
549
UnloadModule(GpuContext * context,hipModule_t module)550 /* static */ void GpuDriver::UnloadModule(GpuContext* context,
551 hipModule_t module) {
552 ScopedActivateContext activated{context};
553 hipError_t res = tensorflow::wrap::hipModuleUnload(module);
554 if (res != hipSuccess) {
555 LOG(ERROR) << "failed to unload module " << module
556 << "; leaking: " << ToString(res);
557 }
558 }
559
// Creates a HIP stream for `context`'s device in `*stream`. A priority of 0
// means "default priority" and uses the plain creation path; any other value
// is forwarded to hipStreamCreateWithPriority. Returns false after logging on
// failure.
/* static */ bool GpuDriver::CreateStream(GpuContext* context,
                                          GpuStreamHandle* stream,
                                          int priority) {
  ScopedActivateContext activated{context};
  hipError_t res;
  if (priority == 0) {
    res = tensorflow::wrap::hipStreamCreateWithFlags(
        stream, hipStreamDefault);  // switch to hipStreamNonBlocking?
  } else {
    res = tensorflow::wrap::hipStreamCreateWithPriority(
        stream, hipStreamDefault, priority);  // switch to hipStreamNonBlocking?
  }
  if (res != hipSuccess) {
    LOG(ERROR) << "could not allocate ROCM stream for device "
               << context->device_ordinal() << ": " << ToString(res);
    return false;
  }

  VLOG(2) << "successfully created stream " << *stream << " for device "
          << context->device_ordinal() << " on thread";
  return true;
}
582
DestroyStream(GpuContext * context,GpuStreamHandle * stream)583 /* static */ void GpuDriver::DestroyStream(GpuContext* context,
584 GpuStreamHandle* stream) {
585 if (*stream == nullptr) {
586 return;
587 }
588
589 ScopedActivateContext activated{context};
590 hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream);
591 if (res != hipSuccess) {
592 LOG(ERROR) << "failed to destroy ROCM stream for device "
593 << context->device_ordinal() << ": " << ToString(res);
594 } else {
595 VLOG(2) << "successfully destroyed stream " << *stream << " for device "
596 << context->device_ordinal();
597 *stream = nullptr;
598 }
599 }
600
DeviceAllocate(GpuContext * context,uint64 bytes)601 /* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
602 uint64 bytes) {
603 ScopedActivateContext activated{context};
604 hipDeviceptr_t result = 0;
605 hipError_t res = tensorflow::wrap::hipMalloc(&result, bytes);
606 if (res != hipSuccess) {
607 LOG(ERROR) << "failed to allocate "
608 << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
609 << " bytes) from device: " << ToString(res);
610 return nullptr;
611 }
612 void* ptr = reinterpret_cast<void*>(result);
613 VLOG(2) << "allocated " << ptr << " for device " << context->device_ordinal()
614 << " of " << bytes << " bytes";
615 return ptr;
616 }
617
DeviceDeallocate(GpuContext * context,void * location)618 /* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
619 void* location) {
620 ScopedActivateContext activation{context};
621 hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
622 hipError_t res = tensorflow::wrap::hipFree(pointer);
623 if (res != hipSuccess) {
624 LOG(ERROR) << "failed to free device memory at " << location
625 << "; result: " << ToString(res);
626 } else {
627 VLOG(2) << "deallocated " << location << " for device "
628 << context->device_ordinal();
629 }
630 }
631
// Stub: unified (managed) memory allocation is not supported on ROCm here;
// always logs and returns nullptr.
/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
                                                    uint64 bytes) {
  ScopedActivateContext activated{context};

  LOG(ERROR)
      << "Feature not supported on ROCm platform (UnifiedMemoryAllocate)";
  return nullptr;
}
640
// Stub: unified (managed) memory is not supported on ROCm here; logs only.
/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
                                                     void* location) {
  LOG(ERROR)
      << "Feature not supported on ROCm platform (UnifiedMemoryDeallocate)";
}
646
HostAllocate(GpuContext * context,uint64 bytes)647 /* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
648 ScopedActivateContext activation{context};
649 void* host_mem = nullptr;
650 // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
651 hipError_t res =
652 tensorflow::wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
653 if (res != hipSuccess) {
654 LOG(ERROR) << "failed to alloc " << bytes
655 << " bytes on host: " << ToString(res);
656 }
657 return host_mem;
658 }
659
HostDeallocate(GpuContext * context,void * location)660 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
661 void* location) {
662 ScopedActivateContext activation{context};
663 hipError_t res = tensorflow::wrap::hipHostFree(location);
664 if (res != hipSuccess) {
665 LOG(ERROR) << "error deallocating host memory at " << location << ": "
666 << ToString(res);
667 }
668 }
669
HostRegister(GpuContext * context,void * location,uint64 bytes)670 /* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
671 uint64 bytes) {
672 ScopedActivateContext activation{context};
673 // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
674 hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes,
675 hipHostRegisterPortable);
676 if (res != hipSuccess) {
677 LOG(ERROR) << "error registering host memory at " << location << ": "
678 << ToString(res);
679 return false;
680 }
681 return true;
682 }
683
HostUnregister(GpuContext * context,void * location)684 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
685 void* location) {
686 ScopedActivateContext activation{context};
687 hipError_t res = tensorflow::wrap::hipHostUnregister(location);
688 if (res != hipSuccess) {
689 LOG(ERROR) << "error unregistering host memory at " << location << ": "
690 << ToString(res);
691 return false;
692 }
693 return true;
694 }
695
// Destroys `*event` and nulls the handle. Error mapping: null handle ->
// INVALID_ARGUMENT; driver (de)initialization failures ->
// FAILED_PRECONDITION; anything else -> INTERNAL.
/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                  GpuEventHandle* event) {
  if (*event == nullptr) {
    return port::Status{port::error::INVALID_ARGUMENT,
                        "input event cannot be null"};
  }

  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipEventDestroy(*event);
  // Handle is cleared regardless of whether destruction succeeded.
  *event = nullptr;

  switch (res) {
    case hipSuccess:
      return port::Status::OK();
    case hipErrorDeinitialized:
    case hipErrorNotInitialized:
      return port::Status{
          port::error::FAILED_PRECONDITION,
          absl::StrFormat("error destroying ROCM event in device %d: %s",
                          context->device_ordinal(), ToString(res).c_str())};
    default:
      return port::Status{
          port::error::INTERNAL,
          absl::StrFormat("error destroying ROCM event in device %d: %s",
                          context->device_ordinal(), ToString(res).c_str())};
  }
}
723
// Records `event` on `stream`. Error mapping: driver (de)initialization
// failures -> FAILED_PRECONDITION; anything else -> INVALID_ARGUMENT.
/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
                                                 GpuEventHandle event,
                                                 GpuStreamHandle stream) {
  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipEventRecord(event, stream);
  switch (res) {
    case hipSuccess:
      return port::Status::OK();
    case hipErrorDeinitialized:
    case hipErrorNotInitialized:
      return port::Status{
          port::error::FAILED_PRECONDITION,
          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
                          ToString(res).c_str())};
    default:
      return port::Status{
          port::error::INVALID_ARGUMENT,
          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
                          ToString(res).c_str())};
  }
}
745
QueryEvent(GpuContext * context,GpuEventHandle event)746 /* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
747 GpuContext* context, GpuEventHandle event) {
748 ScopedActivateContext activated{context};
749 hipError_t res = tensorflow::wrap::hipEventQuery(event);
750 if (res != hipSuccess && res != hipErrorNotReady) {
751 return port::Status{
752 port::error::INTERNAL,
753 absl::StrFormat("failed to query event: %s", ToString(res).c_str())};
754 }
755
756 return res;
757 }
758
// Computes the elapsed milliseconds between `start` and `stop`, blocking on
// the stop event first. Returns false (after logging) on any failure.
/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
                                                 float* elapsed_milliseconds,
                                                 GpuEventHandle start,
                                                 GpuEventHandle stop) {
  ScopedActivateContext activated{context};
  // The stop event must have completed in order for hipEventElapsedTime to
  // work.
  hipError_t res = tensorflow::wrap::hipEventSynchronize(stop);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
    return false;
  }
  res =
      tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to get elapsed time between events: "
               << ToString(res);
    return false;
  }

  return true;
}
781
WaitStreamOnEvent(GpuContext * context,GpuStreamHandle stream,GpuEventHandle event)782 /* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
783 GpuStreamHandle stream,
784 GpuEventHandle event) {
785 ScopedActivateContext activation{context};
786 hipError_t res =
787 tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */);
788 if (res != hipSuccess) {
789 LOG(ERROR) << "could not wait stream on event: " << ToString(res);
790 return false;
791 }
792
793 return true;
794 }
795
SynchronizeContext(GpuContext * context)796 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
797 ScopedActivateContext activation{context};
798 hipError_t res = tensorflow::wrap::hipDeviceSynchronize();
799 if (res != hipSuccess) {
800 LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
801 << " :: " << port::CurrentStackTrace();
802 return false;
803 }
804
805 return true;
806 }
807
// Blocks until all work previously enqueued on `stream` has completed.
/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
                                                       GpuStreamHandle stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipStreamSynchronize(stream),
                       "Could not synchronize on ROCM stream");
  VLOG(2) << "successfully synchronized stream " << stream << " on device "
          << context->device_ordinal();
  return port::Status::OK();
}
818
IsStreamIdle(GpuContext * context,GpuStreamHandle stream)819 /* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
820 GpuStreamHandle stream) {
821 ScopedActivateContext activated{context};
822 CHECK(stream != nullptr);
823 hipError_t res = tensorflow::wrap::hipStreamQuery(stream);
824 if (res == hipSuccess) {
825 return true;
826 }
827
828 if (res != hipErrorNotReady) {
829 LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
830 }
831 return false;
832 }
833
// Copies `size` bytes from device memory `gpu_src` to host memory `host_dst`,
// blocking until the copy completes. Returns a non-OK status on failure.
/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
    GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size),
      absl::StrFormat("failed to synchronous memcpy from device to host: "
                      "host dst: %p; Gpu src: %p; size: %llu=0x%llx",
                      host_dst, absl::bit_cast<void*>(gpu_src), size, size));
  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
          << host_dst;
  return port::Status::OK();
}
846
// Copies `size` bytes from host memory `host_src` to device memory `gpu_dst`,
// blocking until the copy completes. Returns a non-OK status on failure.
/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(
    GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
    uint64 size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      // const_cast: the wrapped hipMemcpyHtoD takes a non-const source
      // pointer even though the data is only read.
      tensorflow::wrap::hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src),
                                      size),
      absl::StrFormat(
          "failed to synchronous memcpy from host to device: Gpu dst: %p;"
          " host src: %p; size: %llu=0x%llx",
          absl::bit_cast<void*>(gpu_dst), host_src, size, size));
  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
  return port::Status::OK();
}
861
SynchronousMemcpyD2D(GpuContext * context,hipDeviceptr_t gpu_dst,hipDeviceptr_t gpu_src,uint64 size)862 /* static */ port::Status GpuDriver::SynchronousMemcpyD2D(
863 GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
864 uint64 size) {
865 ScopedActivateContext activation{context};
866 RETURN_IF_ROCM_ERROR(
867 tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size),
868 absl::StrFormat(
869 "failed to synchronous memcpy from host to device:Gpu dst: %p; "
870 "Gpu src: %p; size: %llu=0x%llx",
871 absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
872 size));
873 VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
874 return port::Status::OK();
875 }
876
AsynchronousMemcpyD2H(GpuContext * context,void * host_dst,hipDeviceptr_t gpu_src,uint64 size,GpuStreamHandle stream)877 /* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
878 void* host_dst,
879 hipDeviceptr_t gpu_src,
880 uint64 size,
881 GpuStreamHandle stream) {
882 ScopedActivateContext activation{context};
883 hipError_t res =
884 tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
885 if (res != hipSuccess) {
886 LOG(ERROR) << absl::StrFormat(
887 "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
888 "Gpu src: %p; size: %llu=0x%llx",
889 ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
890 size);
891 return false;
892 }
893 VLOG(2) << "successfully enqueued async memcpy d2h of " << size
894 << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
895 << host_dst << " on stream " << stream;
896 return true;
897 }
898
AsynchronousMemcpyH2D(GpuContext * context,hipDeviceptr_t gpu_dst,const void * host_src,uint64 size,GpuStreamHandle stream)899 /* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
900 hipDeviceptr_t gpu_dst,
901 const void* host_src,
902 uint64 size,
903 GpuStreamHandle stream) {
904 ScopedActivateContext activation{context};
905 hipError_t res = tensorflow::wrap::hipMemcpyHtoDAsync(
906 gpu_dst, const_cast<void*>(host_src), size, stream);
907 if (res != hipSuccess) {
908 LOG(ERROR) << absl::StrFormat(
909 "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
910 "host src: %p; size: %llu=0x%llx",
911 ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
912 size);
913 return false;
914 }
915 VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
916 << " on stream " << stream;
917 return true;
918 }
919
// Enqueues a non-blocking device-to-device copy of `size` bytes on `stream`.
// Returns true when the copy was successfully enqueued (not yet completed).
/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
                                                   hipDeviceptr_t gpu_dst,
                                                   hipDeviceptr_t gpu_src,
                                                   uint64 size,
                                                   GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  hipError_t result =
      tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
  if (result != hipSuccess) {
    // On failure, gather extra diagnostics: the memory space and owning
    // device of both pointers, and whether peer access between them exists.
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to device: %s"
        "; Gpu dst: %p on %s %s"
        "; Gpu src: %p on %s %s"
        "; can access? %s; size: %llu=0x%llx",
        ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
        ROCMPointerToMemorySpaceString(gpu_dst).c_str(),
        ROCMPointerToDeviceString(gpu_dst).c_str(),
        absl::bit_cast<void*>(gpu_src),
        ROCMPointerToMemorySpaceString(gpu_src).c_str(),
        ROCMPointerToDeviceString(gpu_src).c_str(),
        ROCMPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);

    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
  return true;
}
947
InitEvent(GpuContext * context,GpuEventHandle * event,EventFlags flags)948 /* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
949 GpuEventHandle* event,
950 EventFlags flags) {
951 int hipflags;
952 switch (flags) {
953 case EventFlags::kDefault:
954 hipflags = hipEventDefault;
955 break;
956 case EventFlags::kDisableTiming:
957 hipflags = hipEventDisableTiming | hipEventReleaseToSystem;
958 break;
959 default:
960 LOG(FATAL) << "impossible event flags: " << int(hipflags);
961 }
962
963 ScopedActivateContext activated{context};
964 hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags);
965
966 if (res == hipSuccess) {
967 return port::Status::OK();
968 } else if (res == hipErrorMemoryAllocation) {
969 return port::Status{port::error::RESOURCE_EXHAUSTED,
970 "could not create ROCM event: out of device memory"};
971 } else {
972 return port::Status{
973 port::error::FAILED_PRECONDITION,
974 absl::StrCat("could not create ROCM event: ", ToString(res))};
975 }
976 }
977
GetDeviceCount()978 /* static */ int GpuDriver::GetDeviceCount() {
979 int device_count = 0;
980 hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count);
981 if (res != hipSuccess) {
982 LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
983 return 0;
984 }
985
986 if (FLAGS_gpuexec_rocm_device_0_only && device_count > 1) {
987 device_count = 1;
988 }
989 return device_count;
990 }
991
// CUDA-style compute-capability (major/minor) numbering does not apply to AMD
// GPUs, so this query always fails with INTERNAL; nothing is written to
// `cc_major`/`cc_minor`.
/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
                                                          int* cc_minor,
                                                          hipDevice_t device) {
  return port::Status(
      port::error::INTERNAL,
      absl::StrFormat("failed to get compute capability for device: %d "
                      "(unsupported API on AMD Gpus)",
                      device));
}
1001
GetPointerAddressRange(hipDeviceptr_t dptr,hipDeviceptr_t * base,size_t * size)1002 /* static */ port::Status GpuDriver::GetPointerAddressRange(
1003 hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
1004 hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, dptr);
1005 if (result == hipSuccess) {
1006 return port::Status::OK();
1007 } else if (result == hipErrorNotFound) {
1008 // We differentiate between "this pointer is unknown" (return here) and
1009 // "there was an internal error while performing this operation" (return
1010 // below).
1011 return port::Status{port::error::NOT_FOUND,
1012 absl::StrFormat("not a device pointer %p; %s",
1013 reinterpret_cast<void*>(dptr),
1014 ToString(result).c_str())};
1015 }
1016
1017 return port::Status{
1018 port::error::INTERNAL,
1019 absl::StrFormat("failed to get pointer into for device pointer %p; %s",
1020 reinterpret_cast<void*>(dptr), ToString(result).c_str())};
1021 }
1022
GetPointerMemorySpace(hipDeviceptr_t pointer)1023 /* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
1024 hipDeviceptr_t pointer) {
1025 unsigned int value;
1026 hipError_t result = hipSuccess;
1027 if (result == hipSuccess) {
1028 switch (value) {
1029 case hipMemoryTypeDevice:
1030 return MemorySpace::kDevice;
1031 case hipMemoryTypeHost:
1032 return MemorySpace::kHost;
1033 default:
1034 return port::Status{
1035 port::error::INTERNAL,
1036 absl::StrCat("unknown memory space provided by ROCM API: ", value)};
1037 }
1038 }
1039
1040 return port::Status{
1041 port::error::INTERNAL,
1042 absl::StrCat("failed to query device pointer for memory space: ",
1043 ToString(result))};
1044 }
1045
GetPointerDevice(hipDeviceptr_t pointer)1046 /* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
1047 hipDeviceptr_t pointer) {
1048 hipPointerAttribute_t pointerAttributes;
1049 hipError_t result =
1050 tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer);
1051 if (result != hipSuccess) {
1052 return port::Status{
1053 port::error::INTERNAL,
1054 absl::StrCat("failed to get device for pointer: ", ToString(result))};
1055 }
1056
1057 hipDevice_t device;
1058 result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device);
1059 if (result != hipSuccess) {
1060 return port::Status{
1061 port::error::INTERNAL,
1062 absl::StrCat("failed to get device for pointer: ", ToString(result))};
1063 }
1064
1065 return device;
1066 }
1067
GetGpuISAVersion(int * version,hipDevice_t device)1068 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
1069 hipDevice_t device) {
1070 hipDeviceProp_t props;
1071 hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
1072 if (result == hipSuccess) {
1073 *version = props.gcnArch;
1074 return port::Status::OK();
1075 }
1076 *version = 0;
1077 return port::Status{
1078 port::error::INTERNAL,
1079 absl::StrFormat("failed to determine AMDGpu ISA version for device %d",
1080 device)};
1081 }
1082
GetGpuGCNArchName(hipDevice_t device,std::string * gcnArchName)1083 /* static */ port::Status GpuDriver::GetGpuGCNArchName(
1084 hipDevice_t device, std::string* gcnArchName) {
1085 hipDeviceProp_t props;
1086 hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
1087 if (result == hipSuccess) {
1088 *gcnArchName = props.gcnArchName;
1089 return port::Status::OK();
1090 }
1091 *gcnArchName = "";
1092 return port::Status{
1093 port::error::INTERNAL,
1094 absl::StrFormat("failed to determine AMDGpu GCN Arch Name for device %d",
1095 device)};
1096 }
1097
1098 // Helper function that turns the integer output of hipDeviceGetAttribute to
1099 // type T and wraps it in a StatusOr.
1100 template <typename T>
GetSimpleAttribute(hipDevice_t device,hipDeviceAttribute_t attribute)1101 static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
1102 hipDeviceAttribute_t attribute) {
1103 int value = -1;
1104 hipError_t result =
1105 tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device);
1106 if (result != hipSuccess) {
1107 return port::Status{
1108 port::error::NOT_FOUND,
1109 absl::StrCat("could not retrieve ROCM device attribute (", attribute,
1110 "): ", ToString(result))};
1111 }
1112 T converted = value;
1113 return converted;
1114 }
1115
GetMultiprocessorCount(hipDevice_t device)1116 /* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
1117 hipDevice_t device) {
1118 return GetSimpleAttribute<int>(device, hipDeviceAttributeMultiprocessorCount);
1119 }
1120
GetMaxSharedMemoryPerCore(hipDevice_t device)1121 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
1122 hipDevice_t device) {
1123 return GetSimpleAttribute<int64>(
1124 device, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor);
1125 }
1126
GetMaxSharedMemoryPerBlock(hipDevice_t device)1127 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
1128 hipDevice_t device) {
1129 return GetSimpleAttribute<int64>(device,
1130 hipDeviceAttributeMaxSharedMemoryPerBlock);
1131 }
1132
GetMaxThreadsPerMultiprocessor(hipDevice_t device)1133 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
1134 hipDevice_t device) {
1135 return GetSimpleAttribute<int64>(
1136 device, hipDeviceAttributeMaxThreadsPerMultiProcessor);
1137 }
1138
GetMaxThreadsPerBlock(hipDevice_t device)1139 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
1140 hipDevice_t device) {
1141 return GetSimpleAttribute<int64>(device,
1142 hipDeviceAttributeMaxThreadsPerBlock);
1143 }
1144
GetMaxRegistersPerBlock(hipDevice_t device)1145 /* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
1146 hipDevice_t device) {
1147 return GetSimpleAttribute<int64>(device,
1148 hipDeviceAttributeMaxRegistersPerBlock);
1149 }
1150
GetThreadsPerWarp(hipDevice_t device)1151 /* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
1152 hipDevice_t device) {
1153 return GetSimpleAttribute<int64>(device, hipDeviceAttributeWarpSize);
1154 }
1155
GetGridLimits(int * x,int * y,int * z,hipDevice_t device)1156 /* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
1157 hipDevice_t device) {
1158 int value;
1159 hipError_t res = tensorflow::wrap::hipDeviceGetAttribute(
1160 &value, hipDeviceAttributeMaxGridDimX, device);
1161 if (res != hipSuccess) {
1162 LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
1163 return false;
1164 }
1165 *x = value;
1166
1167 res = tensorflow::wrap::hipDeviceGetAttribute(
1168 &value, hipDeviceAttributeMaxGridDimY, device);
1169 if (res != hipSuccess) {
1170 LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
1171 return false;
1172 }
1173 *y = value;
1174
1175 res = tensorflow::wrap::hipDeviceGetAttribute(
1176 &value, hipDeviceAttributeMaxGridDimZ, device);
1177 if (res != hipSuccess) {
1178 LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
1179 return false;
1180 }
1181 *z = value;
1182 return true;
1183 }
1184
GetDriverVersion(int * driver_version)1185 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
1186 hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version);
1187 if (res != hipSuccess) {
1188 LOG(ERROR) << "failed to query driver version: " << ToString(res);
1189 return false;
1190 }
1191
1192 return true;
1193 }
1194
GetDeviceProperties(hipDeviceProp_t * device_properties,int device_ordinal)1195 /* static */ bool GpuDriver::GetDeviceProperties(
1196 hipDeviceProp_t* device_properties, int device_ordinal) {
1197 hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties,
1198 device_ordinal);
1199 if (res != hipSuccess) {
1200 LOG(ERROR) << "failed to query device properties: " << ToString(res);
1201 return false;
1202 }
1203
1204 return true;
1205 }
1206
// Generic passthrough for querying any integer-valued device attribute.
/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
    hipDeviceAttribute_t attribute, hipDevice_t device) {
  return GetSimpleAttribute<int>(device, attribute);
}
1211
// Reports whether ECC is enabled on `device` via `*result`.
// NOTE(review): the HIP query is not implemented yet (see TODO below): `res`
// is hard-coded to hipSuccess and `value` stays -1, so this currently always
// sets *result = true (-1 converts to true) and returns true. Confirm that
// callers tolerate this placeholder behavior.
/* static */ bool GpuDriver::IsEccEnabled(hipDevice_t device, bool* result) {
  int value = -1;
  hipError_t res = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
    return false;
  }

  *result = value;
  return true;
}
1224
GetDeviceMemoryInfo(GpuContext * context,int64 * free_out,int64 * total_out)1225 /* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
1226 int64* free_out,
1227 int64* total_out) {
1228 ScopedActivateContext activation{context};
1229 size_t free = 0;
1230 size_t total = 0;
1231 hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total);
1232 if (res != hipSuccess) {
1233 LOG(ERROR) << "failed to query device memory info: " << ToString(res);
1234 return false;
1235 }
1236
1237 *free_out = free;
1238 *total_out = total;
1239 return true;
1240 }
1241
GetDeviceTotalMemory(hipDevice_t device,uint64 * result)1242 /* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
1243 uint64* result) {
1244 size_t value = -1;
1245 hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device);
1246 if (res != hipSuccess) {
1247 LOG(ERROR) << "failed to query total available memory: " << ToString(res);
1248 return false;
1249 }
1250
1251 *result = value;
1252 return true;
1253 }
1254
GetPCIBusID(hipDevice_t device)1255 /* static */ string GpuDriver::GetPCIBusID(hipDevice_t device) {
1256 string pci_bus_id;
1257 static const int kBufferSize = 64;
1258 absl::InlinedVector<char, 4> chars(kBufferSize);
1259 chars[kBufferSize - 1] = '\0';
1260 hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(
1261 chars.begin(), kBufferSize - 1, device);
1262 if (res != hipSuccess) {
1263 LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
1264 return pci_bus_id;
1265 }
1266 pci_bus_id = chars.begin();
1267 return pci_bus_id;
1268 }
1269
CanEnablePeerAccess(GpuContext * from,GpuContext * to)1270 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
1271 GpuContext* to) {
1272 if (from->device_ordinal() == to->device_ordinal()) {
1273 return true; // A device can always access its own memory.
1274 }
1275
1276 int can_access_peer = -1;
1277 hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer(
1278 &can_access_peer, from->device_ordinal(), to->device_ordinal());
1279 if (res != hipSuccess) {
1280 LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
1281 return false;
1282 }
1283
1284 return can_access_peer;
1285 }
1286
EnablePeerAccess(GpuContext * from,GpuContext * to)1287 /* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
1288 GpuContext* to) {
1289 if (from->device_ordinal() == to->device_ordinal()) {
1290 return port::Status::OK(); // A device can always access its own memory.
1291 }
1292
1293 ScopedActivateContext activated{from};
1294 hipError_t result = tensorflow::wrap::hipDeviceEnablePeerAccess(
1295 to->device_ordinal(), 0 /* = flags */);
1296 if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
1297 return port::Status{
1298 port::error::INTERNAL,
1299 absl::StrFormat("failed to enable peer access from %d to %d: %s",
1300 from->device_ordinal(), to->device_ordinal(),
1301 ToString(result).c_str())};
1302 }
1303
1304 return port::Status::OK();
1305 }
1306
// Computes the maximum number of blocks of `kernel` that can be resident on
// one compute unit for the given thread count and dynamic shared memory use.
// NOTE(review): the occupancy query is not implemented yet (see TODO below);
// `result` is hard-coded to hipSuccess, so this currently always returns
// max_blocks == 0. Confirm callers treat 0 as "unknown" rather than "none".
/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
    GpuContext* context, hipFunction_t kernel, int threads_per_block,
    size_t dynamic_shared_memory_bytes) {
  ScopedActivateContext activation{context};

  int max_blocks = 0;
  hipError_t result = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  if (result != hipSuccess) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("failed to calculate occupancy of kernel %p: %s",
                        kernel, ToString(result).c_str())};
  }

  return max_blocks;
}
1324
1325 } // namespace gpu
1326 } // namespace stream_executor
1327