• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include <stdint.h>
17 #include <stdlib.h>
18 
19 #include <map>
20 #include <set>
21 #include <utility>
22 
23 #include "absl/base/casts.h"
24 #include "absl/container/inlined_vector.h"
25 #include "absl/strings/str_cat.h"
26 #include "absl/strings/str_format.h"
27 #include "absl/synchronization/mutex.h"
28 #include "absl/synchronization/notification.h"
29 #include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
30 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
31 #include "tensorflow/stream_executor/lib/env.h"
32 #include "tensorflow/stream_executor/lib/error.h"
33 #include "tensorflow/stream_executor/lib/human_readable.h"
34 #include "tensorflow/stream_executor/lib/stacktrace.h"
35 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
36 #include "tensorflow/stream_executor/lib/threadpool.h"
37 #include "tensorflow/stream_executor/platform/logging.h"
38 #include "tensorflow/stream_executor/platform/port.h"
39 #include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"
40 
// Debug knobs (set programmatically; not wired to a command-line parser here):
// - inject_init_error: force InternalInit() to fail, for testing error paths.
// - sync_around_driver_calls: hipDeviceSynchronize before/after each scoped
//   activation, to localize asynchronous failures.
// - device_0_only: restrict enumeration to device 0.
bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
bool FLAGS_gpuexec_rocm_device_0_only = false;

// Evaluates `expr` once; if it is anything other than hipSuccess, returns a
// port::InternalError from the enclosing function whose message is the
// StrCat of __VA_ARGS__, ": ", and the stringified HIP error.
#define RETURN_IF_ROCM_ERROR(expr, ...)                                \
  do {                                                                 \
    hipError_t _res = (expr);                                          \
    if (TF_PREDICT_FALSE(_res != hipSuccess)) {                        \
      return port::InternalError(absl::StrCat(                         \
          __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res))); \
    }                                                                  \
  } while (0)

// Debugging: on each push and pop of a rocm context, verify the current device
// matches the expected one.
constexpr bool kVerifyGpuContext = false;
57 
58 namespace stream_executor {
59 namespace gpu {
60 
61 // GpuContext wraps the device_ordinal.
62 // Only reason we need this wrapper class is to make the GpuDriver* API
// GpuContext wraps the device_ordinal.
// Only reason we need this wrapper class is to make the GpuDriver* API
class GpuContext {
 public:
  GpuContext(const int v) : ordinal_{v} {}

  // Copying and moving are disallowed; a context is referred to by pointer.
  GpuContext(const GpuContext&) = delete;
  GpuContext& operator=(const GpuContext&) = delete;
  GpuContext(GpuContext&&) = delete;
  GpuContext& operator=(GpuContext&&) = delete;

  // Ordinal of the device this context stands for.
  int device_ordinal() const { return ordinal_; }

 private:
  const int ordinal_;
};
78 
79 namespace {
80 
81 // Formats hipError_t to output prettified values into a log stream.
82 // Error summaries taken from:
ToString(hipError_t result)83 string ToString(hipError_t result) {
84 #define OSTREAM_ROCM_ERROR(__name) \
85   case hipError##__name:           \
86     return "HIP_ERROR_" #__name;
87 
88   switch (result) {
89     OSTREAM_ROCM_ERROR(InvalidValue)
90     OSTREAM_ROCM_ERROR(OutOfMemory)
91     OSTREAM_ROCM_ERROR(NotInitialized)
92     OSTREAM_ROCM_ERROR(Deinitialized)
93     OSTREAM_ROCM_ERROR(NoDevice)
94     OSTREAM_ROCM_ERROR(InvalidDevice)
95     OSTREAM_ROCM_ERROR(InvalidImage)
96     OSTREAM_ROCM_ERROR(InvalidContext)
97     OSTREAM_ROCM_ERROR(InvalidHandle)
98     OSTREAM_ROCM_ERROR(NotFound)
99     OSTREAM_ROCM_ERROR(NotReady)
100     OSTREAM_ROCM_ERROR(NoBinaryForGpu)
101 
102     // Encountered an uncorrectable ECC error during execution.
103     OSTREAM_ROCM_ERROR(ECCNotCorrectable)
104 
105     // Load/store on an invalid address. Must reboot all context.
106     case 700:
107       return "ROCM_ERROR_ILLEGAL_ADDRESS";
108     // Passed too many / wrong arguments, too many threads for register count.
109     case 701:
110       return "ROCM_ERROR_LAUNCH_OUT_OF_RESOURCES";
111 
112       OSTREAM_ROCM_ERROR(ContextAlreadyInUse)
113       OSTREAM_ROCM_ERROR(PeerAccessUnsupported)
114       OSTREAM_ROCM_ERROR(Unknown)  // Unknown internal error to ROCM.
115     default:
116       return absl::StrCat("hipError_t(", static_cast<int>(result), ")");
117   }
118 }
119 
120 // ROCM driver routines may require a large amount of stack (particularly
121 // hipModuleLoadDataEx, in our experience). To avoid stack overflow when using
122 // stack-limited threads (such as those spawned by a default-argument
123 // thread::ThreadPool on some platforms), we run certain routines in this pool
124 // and wait for completion.
port::ThreadPool* GetDriverExecutor() {
  // Intentionally leaked singleton; a single worker thread serializes the
  // stack-hungry driver calls described in the comment above.
  static port::ThreadPool* thread_pool = new port::ThreadPool(
      port::Env::Default(), port::ThreadOptions(), "rocm_driver", 1);
  return thread_pool;
}
130 
131 }  // namespace
132 
// Returns a human-readable name for the given memory space; LOG(FATAL)s the
// process on a value outside the enum (unreachable for valid inputs).
string MemorySpaceString(MemorySpace memory_space) {
  switch (memory_space) {
    case MemorySpace::kHost:
      return "host";
    case MemorySpace::kDevice:
      return "device";
    default:
      // No return needed: LOG(FATAL) does not return.
      LOG(FATAL) << "impossible memory space";
  }
}
143 
144 // Returns the current device set in HIP. This is done by calling the
145 // HIP driver (e.g., this value is not our cached view of the current device).
CurrentDeviceOrDie()146 static int CurrentDeviceOrDie() {
147   int current = -1;
148   hipError_t result = tensorflow::wrap::hipGetDevice(&current);
149   if (result != hipSuccess) {
150     LOG(FATAL) << "failed to query current device: " << ToString(result);
151   }
152   return current;
153 }
154 
155 namespace {
156 
157 // Call hipDeviceSynchronize and crash if it doesn't succeed.
SynchronizeOrDie()158 void SynchronizeOrDie() {
159   auto res = tensorflow::wrap::hipDeviceSynchronize();
160   if (res != hipSuccess) {
161     LOG(FATAL) << "Synchronize found " << ToString(res)
162                << " :: " << port::CurrentStackTrace();
163   }
164 }
165 
// Per-thread bookkeeping for ScopedActivateContext: which device this thread
// believes is current, and how deeply activations are nested.
struct ThreadLocalData {
  int current_device_ordinal;  // device this thread last set via hipSetDevice
  int depth;                   // number of live ScopedActivateContext scopes
};

SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
172 
173 }  // namespace
174 
// Activates `context`'s device on the calling thread, tracking nesting depth
// in thread-local state so redundant hipSetDevice calls are skipped.
ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
  // Optional debugging aid: synchronize around every driver call to make
  // asynchronous failures surface near their cause.
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();
  if (tls->depth == 0) {
    // Outermost activation on this thread: learn the actual current device
    // from the driver so we can tell whether a switch is needed.
    tls->current_device_ordinal = CurrentDeviceOrDie();
  }

  if (kVerifyGpuContext) {
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  tls->depth++;

  // NOTE(review): to_restore_ records the context being ACTIVATED, not the
  // previously-active one, so the destructor usually finds the device already
  // current — confirm this asymmetry is intended.
  to_restore_ = context;

  if (context->device_ordinal() == tls->current_device_ordinal) {
    // Already on the requested device; nothing to switch.
    DCHECK_EQ(CurrentDeviceOrDie(), context->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to " << context->device_ordinal();

  // Set the device and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(context->device_ordinal()));
  tls->current_device_ordinal = context->device_ordinal();
}
206 
// Pops one activation level and, if the thread's current device differs from
// to_restore_'s, switches back to it. Given how the constructor sets
// to_restore_ (see note there), the early return is the common path.
ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  // Unbalanced construction/destruction would drive depth negative.
  tls->depth--;
  DCHECK_GE(tls->depth, 0);

  if (to_restore_->device_ordinal() == tls->current_device_ordinal) {
    DCHECK_EQ(CurrentDeviceOrDie(), to_restore_->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to "
          << to_restore_->device_ordinal();

  // Set context and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal()));
  tls->current_device_ordinal = to_restore_->device_ordinal();
}
235 
236 namespace {
237 
238 // Returns a stringified device number associated with pointer, primarily for
239 // logging purposes. Returns "?" if the device could not be successfully
240 // queried.
ROCMPointerToDeviceString(hipDeviceptr_t pointer)241 string ROCMPointerToDeviceString(hipDeviceptr_t pointer) {
242   auto value = GpuDriver::GetPointerDevice(pointer);
243   if (value.ok()) {
244     return absl::StrCat(value.ValueOrDie());
245   }
246   LOG(ERROR) << "could not query device: " << value.status();
247   return "?";
248 }
249 
250 // Returns a stringified memory space associated with pointer, primarily for
251 // logging purposes. Returns "?" if the memory space could not be successfully
252 // queried.
ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer)253 string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
254   auto value = GpuDriver::GetPointerMemorySpace(pointer);
255   if (value.ok()) {
256     return MemorySpaceString(value.ValueOrDie());
257   }
258   LOG(ERROR) << "could not query device: " << value.status();
259   return "?";
260 }
261 
262 // Returns a stringified representation of whether or not peer access is
263 // permitted between the "from" and "to" pointers' associated contexts,
264 // primarily for logging purposes. Returns "error" if an error is encountered
265 // in the process of querying.
ROCMPointersToCanAccessString(hipDeviceptr_t from,hipDeviceptr_t to)266 string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
267   hipPointerAttribute_t from_pointerAttributes;
268   hipError_t result =
269       tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from);
270   if (result != hipSuccess) {
271     LOG(ERROR) << "could not retrieve source pointer's device: "
272                << ToString(result);
273     return "error";
274   }
275 
276   hipPointerAttribute_t to_pointerAttributes;
277   result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to);
278   if (result != hipSuccess) {
279     LOG(ERROR) << "could not retrieve destination pointer's device: "
280                << ToString(result);
281     return "error";
282   }
283 
284   GpuContext fromCtx(from_pointerAttributes.device);
285   GpuContext toCtx(to_pointerAttributes.device);
286 
287   return GpuDriver::CanEnablePeerAccess(&fromCtx, &toCtx) ? "true" : "false";
288 }
289 
290 // Actually performs the work of ROCM initialization. Wrapped up in one-time
291 // execution guard.
InternalInit()292 static port::Status InternalInit() {
293   hipError_t res = hipErrorNoDevice;
294   if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
295     LOG(ERROR) << "injecting ROCM init error; initialization will fail";
296   } else {
297     res = tensorflow::wrap::hipInit(0 /* = flags */);
298   }
299 
300   if (res == hipSuccess) {
301     return port::Status::OK();
302   }
303 
304   LOG(ERROR) << "failed call to hipInit: " << ToString(res);
305   Diagnostician::LogDiagnosticInformation();
306   return port::Status{port::error::ABORTED,
307                       absl::StrCat("failed call to hipInit: ", ToString(res))};
308 }
309 
310 }  // namespace
311 
Init()312 /* static */ port::Status GpuDriver::Init() {
313   // Cached return value from calling InternalInit(), as hipInit need only be
314   // called once, but GpuDriver::Init may be called many times.
315   static port::Status* init_retval = [] {
316     return new port::Status(InternalInit());
317   }();
318   return *init_retval;
319 }
320 
GetDevice(int device_ordinal,hipDevice_t * device)321 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
322                                                hipDevice_t* device) {
323   hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal);
324   if (res == hipSuccess) {
325     return port::Status::OK();
326   }
327 
328   return port::Status{
329       port::error::INTERNAL,
330       absl::StrCat("failed call to hipDeviceGet: ", ToString(res))};
331 }
332 
GetDeviceName(hipDevice_t device,string * device_name)333 /* static */ port::Status GpuDriver::GetDeviceName(hipDevice_t device,
334                                                    string* device_name) {
335   static const size_t kCharLimit = 64;
336   absl::InlinedVector<char, 4> chars(kCharLimit);
337   RETURN_IF_ROCM_ERROR(
338       tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device),
339       "Failed to get device name");
340   chars[kCharLimit - 1] = '\0';
341   *device_name = chars.begin();
342   return port::Status::OK();
343 }
344 
// HIP contexts take no creation flags, so the options are not translated;
// *flags is left untouched and the function always succeeds. The
// static_assert forces a revisit if DeviceOptions grows new bits.
bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
                                 int* flags) {
  static_assert(DeviceOptions::kMask == 0xf,
                "needs update for new device options");
  return true;
}
351 
// "Creating" a context on ROCm just records the device ordinal; `device` and
// `device_options` are unused on this platform. The caller owns *context and
// must release it via DestroyContext.
/* static */ port::Status GpuDriver::CreateContext(
    int device_ordinal, hipDevice_t device, const DeviceOptions& device_options,
    GpuContext** context) {
  *context = new GpuContext(device_ordinal);
  return port::Status::OK();
}
DestroyContext(GpuContext * context)358 /* static */ void GpuDriver::DestroyContext(GpuContext* context) {
359   if (context == nullptr) {
360     return;
361   }
362   delete context;
363 }
364 
// Stub: function-attribute queries are not implemented for HIP yet, so this
// reports success without writing *attribute_value. Callers must not rely on
// the output being populated.
/* static */ port::Status GpuDriver::FuncGetAttribute(
    hipDeviceAttribute_t attribute, hipFunction_t func, int* attribute_value) {
  // TODO(ROCm) properly implement this feature in HIP
  return port::Status::OK();
}
370 
FuncSetCacheConfig(hipFunction_t function,hipFuncCache_t cache_config)371 /* static */ port::Status GpuDriver::FuncSetCacheConfig(
372     hipFunction_t function, hipFuncCache_t cache_config) {
373   RETURN_IF_ROCM_ERROR(
374       tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config),
375       "Failed to set ROCM kernel cache config.");
376   return port::Status::OK();
377 }
378 
// Queries the shared-memory bank-size configuration of `context`'s device.
/* static */ port::StatusOr<hipSharedMemConfig>
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  hipSharedMemConfig shared_mem_config;
  // Activate the target device before the per-device query.
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config),
      "Failed to get shared memory config");
  return shared_mem_config;
}
388 
ContextSetSharedMemConfig(GpuContext * context,hipSharedMemConfig shared_mem_config)389 /* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
390     GpuContext* context, hipSharedMemConfig shared_mem_config) {
391   ScopedActivateContext activation{context};
392   RETURN_IF_ROCM_ERROR(
393       tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config),
394       "Failed to set ROCM device shared memory config");
395   return port::Status::OK();
396 }
397 
// Launches `function` on `stream` with the given grid/block geometry and
// dynamic shared memory. Arguments are passed either via `kernel_params` or
// the packed `extra` buffer (mutually exclusive, per the HIP module API).
/* static */ port::Status GpuDriver::LaunchKernel(
    GpuContext* context, hipFunction_t function, unsigned int grid_dim_x,
    unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
    unsigned int block_dim_y, unsigned int block_dim_z,
    unsigned int shared_mem_bytes, GpuStreamHandle stream, void** kernel_params,
    void** extra) {
  ScopedActivateContext activation{context};
  VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
          << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
  RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipModuleLaunchKernel(
                           function, grid_dim_x, grid_dim_y, grid_dim_z,
                           block_dim_x, block_dim_y, block_dim_z,
                           shared_mem_bytes, stream, kernel_params, extra),
                       "Failed to launch ROCM kernel");
  // Note: a "successful launch" only means the enqueue succeeded; execution
  // errors surface asynchronously.
  VLOG(2) << "successfully launched kernel";
  return port::Status::OK();
}
417 
// PTX is a CUDA-only module format; loading it is unsupported on ROCm, so
// this always fails with an internal error.
/* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
                                             const char* ptx_contents,
                                             hipModule_t* module) {
  LOG(ERROR) << "Feature not supported on ROCm platform (LoadPtx)";
  return port::InternalError("Not Implemented");
}
424 
// CUBIN is a CUDA-only binary format; loading it is unsupported on ROCm, so
// this always fails with an internal error.
/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
                                               const char* cubin_bytes,
                                               hipModule_t* module) {
  return port::Status{port::error::INTERNAL,
                      "Feature not supported on ROCm platform (LoadCubin)"};
}
431 
LoadHsaco(GpuContext * context,const char * hsaco_contents,hipModule_t * module)432 /* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
433                                                const char* hsaco_contents,
434                                                hipModule_t* module) {
435   absl::Notification notification;
436   port::Status ret = port::Status::OK();
437   GetDriverExecutor()->Schedule([context, hsaco_contents, module, &ret,
438                                  &notification]() {
439     ScopedActivateContext activation{context};
440     void* hsaco_data = const_cast<char*>(hsaco_contents);
441 
442     hipError_t res = tensorflow::wrap::hipModuleLoadData(module, hsaco_data);
443 
444     if (res != hipSuccess) {
445       ret = port::InternalError(
446           absl::StrCat("Failed to load HSACO: ", ToString(res)));
447       notification.Notify();
448     }
449 
450     CHECK(module != nullptr);
451     notification.Notify();
452   });
453   notification.WaitForNotification();
454 
455   return ret;
456 }
457 
SynchronousMemsetUint8(GpuContext * context,hipDeviceptr_t location,uint8 value,size_t size)458 /* static */ port::Status GpuDriver::SynchronousMemsetUint8(
459     GpuContext* context, hipDeviceptr_t location, uint8 value, size_t size) {
460   ScopedActivateContext activation{context};
461   RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipMemsetD8(location, value, size),
462                        "Failed to memset memory");
463   return port::Status::OK();
464 }
465 
// Synchronously fills `uint32_count` 32-bit words at `location` with `value`.
/* static */ port::Status GpuDriver::SynchronousMemsetUint32(
    GpuContext* context, hipDeviceptr_t location, uint32 value,
    size_t uint32_count) {
  ScopedActivateContext activation{context};
  // hipMemsetD32 takes a void*; reinterpret the device pointer bit-for-bit.
  void* pointer = absl::bit_cast<void*>(location);
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemsetD32(pointer, value, uint32_count),
      "Failed to memset memory");
  return port::Status::OK();
}
476 
// Enqueues an asynchronous byte-wise memset on `stream`.
// NOTE(review): the parameter is named `uint32_count` but it is passed
// straight to hipMemsetAsync, which takes a size in bytes — confirm against
// callers and consider renaming.
/* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
    GpuContext* context, hipDeviceptr_t location, uint8 value,
    size_t uint32_count, GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return port::Status::OK();
}
486 
// Enqueues an asynchronous fill of `uint32_count` 32-bit words on `stream`.
/* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
    GpuContext* context, hipDeviceptr_t location, uint32 value,
    size_t uint32_count, GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  // hipMemsetD32Async takes a void*; reinterpret the device pointer
  // bit-for-bit.
  void* pointer = absl::bit_cast<void*>(location);
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemsetD32Async(pointer, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  VLOG(2) << "successfully enqueued async memset operation";
  return port::Status::OK();
}
498 
AddStreamCallback(GpuContext * context,GpuStreamHandle stream,StreamCallback callback,void * data)499 /* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
500                                                GpuStreamHandle stream,
501                                                StreamCallback callback,
502                                                void* data) {
503   hipError_t res = tensorflow::wrap::hipStreamAddCallback(
504       stream, (hipStreamCallback_t)callback, data, 0 /* = flags */);
505   if (res != hipSuccess) {
506     LOG(ERROR) << "unable to add host callback: " << ToString(res);
507     return false;
508   }
509   return true;
510 }
511 
GetModuleFunction(GpuContext * context,hipModule_t module,const char * kernel_name,hipFunction_t * function)512 /* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
513                                                hipModule_t module,
514                                                const char* kernel_name,
515                                                hipFunction_t* function) {
516   ScopedActivateContext activated{context};
517   CHECK(module != nullptr && kernel_name != nullptr);
518   hipError_t res =
519       tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name);
520   if (res != hipSuccess) {
521     LOG(ERROR) << "failed to get kernel \"" << kernel_name
522                << "\" from module: " << ToString(res);
523     return false;
524   }
525 
526   return true;
527 }
528 
// Looks up module-level global `symbol_name` in `module`, writing its device
// address and/or size through the out-params (at least one must be non-null).
// Returns false if the symbol is not present in this module.
/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
                                             hipModule_t module,
                                             const char* symbol_name,
                                             hipDeviceptr_t* dptr,
                                             size_t* bytes) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && symbol_name != nullptr &&
        (dptr != nullptr || bytes != nullptr));
  hipError_t res =
      tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name);
  if (res != hipSuccess) {
    // symbol may not be found in the current module, but it may reside in
    // another module.
    VLOG(2) << "failed to get symbol \"" << symbol_name
            << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}
549 
UnloadModule(GpuContext * context,hipModule_t module)550 /* static */ void GpuDriver::UnloadModule(GpuContext* context,
551                                           hipModule_t module) {
552   ScopedActivateContext activated{context};
553   hipError_t res = tensorflow::wrap::hipModuleUnload(module);
554   if (res != hipSuccess) {
555     LOG(ERROR) << "failed to unload module " << module
556                << "; leaking: " << ToString(res);
557   }
558 }
559 
// Creates a HIP stream on `context`'s device. A priority of 0 requests a
// default stream; any other value is forwarded to
// hipStreamCreateWithPriority. Returns false (after logging) on failure.
/* static */ bool GpuDriver::CreateStream(GpuContext* context,
                                          GpuStreamHandle* stream,
                                          int priority) {
  ScopedActivateContext activated{context};
  hipError_t res;
  if (priority == 0) {
    res = tensorflow::wrap::hipStreamCreateWithFlags(
        stream, hipStreamDefault);  // switch to hipStreamNonBlocking?
  } else {
    res = tensorflow::wrap::hipStreamCreateWithPriority(
        stream, hipStreamDefault, priority);  // switch to hipStreamNonBlocking?
  }
  if (res != hipSuccess) {
    LOG(ERROR) << "could not allocate ROCM stream for device "
               << context->device_ordinal() << ": " << ToString(res);
    return false;
  }

  VLOG(2) << "successfully created stream " << *stream << " for device "
          << context->device_ordinal() << " on thread";
  return true;
}
582 
DestroyStream(GpuContext * context,GpuStreamHandle * stream)583 /* static */ void GpuDriver::DestroyStream(GpuContext* context,
584                                            GpuStreamHandle* stream) {
585   if (*stream == nullptr) {
586     return;
587   }
588 
589   ScopedActivateContext activated{context};
590   hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream);
591   if (res != hipSuccess) {
592     LOG(ERROR) << "failed to destroy ROCM stream for device "
593                << context->device_ordinal() << ": " << ToString(res);
594   } else {
595     VLOG(2) << "successfully destroyed stream " << *stream << " for device "
596             << context->device_ordinal();
597     *stream = nullptr;
598   }
599 }
600 
DeviceAllocate(GpuContext * context,uint64 bytes)601 /* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
602                                              uint64 bytes) {
603   ScopedActivateContext activated{context};
604   hipDeviceptr_t result = 0;
605   hipError_t res = tensorflow::wrap::hipMalloc(&result, bytes);
606   if (res != hipSuccess) {
607     LOG(ERROR) << "failed to allocate "
608                << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
609                << " bytes) from device: " << ToString(res);
610     return nullptr;
611   }
612   void* ptr = reinterpret_cast<void*>(result);
613   VLOG(2) << "allocated " << ptr << " for device " << context->device_ordinal()
614           << " of " << bytes << " bytes";
615   return ptr;
616 }
617 
DeviceDeallocate(GpuContext * context,void * location)618 /* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
619                                               void* location) {
620   ScopedActivateContext activation{context};
621   hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
622   hipError_t res = tensorflow::wrap::hipFree(pointer);
623   if (res != hipSuccess) {
624     LOG(ERROR) << "failed to free device memory at " << location
625                << "; result: " << ToString(res);
626   } else {
627     VLOG(2) << "deallocated " << location << " for device "
628             << context->device_ordinal();
629   }
630 }
631 
// Unified (managed) memory is not supported on this ROCm path; always logs
// and returns nullptr.
/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
                                                    uint64 bytes) {
  ScopedActivateContext activated{context};

  LOG(ERROR)
      << "Feature not supported on ROCm platform (UnifiedMemoryAllocate)";
  return nullptr;
}
640 
// Counterpart of UnifiedMemoryAllocate; unsupported on this ROCm path, so it
// only logs (nothing was ever allocated to free).
/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
                                                     void* location) {
  LOG(ERROR)
      << "Feature not supported on ROCm platform (UnifiedMemoryDeallocate)";
}
646 
HostAllocate(GpuContext * context,uint64 bytes)647 /* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
648   ScopedActivateContext activation{context};
649   void* host_mem = nullptr;
650   // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
651   hipError_t res =
652       tensorflow::wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
653   if (res != hipSuccess) {
654     LOG(ERROR) << "failed to alloc " << bytes
655                << " bytes on host: " << ToString(res);
656   }
657   return host_mem;
658 }
659 
HostDeallocate(GpuContext * context,void * location)660 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
661                                             void* location) {
662   ScopedActivateContext activation{context};
663   hipError_t res = tensorflow::wrap::hipHostFree(location);
664   if (res != hipSuccess) {
665     LOG(ERROR) << "error deallocating host memory at " << location << ": "
666                << ToString(res);
667   }
668 }
669 
// Page-locks `bytes` bytes of existing host memory at `location` so the GPU
// can access it directly. Returns false (after logging) on failure.
/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
                                          uint64 bytes) {
  ScopedActivateContext activation{context};
  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
  hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes,
                                                     hipHostRegisterPortable);
  if (res != hipSuccess) {
    LOG(ERROR) << "error registering host memory at " << location << ": "
               << ToString(res);
    return false;
  }
  return true;
}
683 
HostUnregister(GpuContext * context,void * location)684 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
685                                             void* location) {
686   ScopedActivateContext activation{context};
687   hipError_t res = tensorflow::wrap::hipHostUnregister(location);
688   if (res != hipSuccess) {
689     LOG(ERROR) << "error unregistering host memory at " << location << ": "
690                << ToString(res);
691     return false;
692   }
693   return true;
694 }
695 
// Destroys *event and nulls the handle. Returns INVALID_ARGUMENT for a null
// handle, FAILED_PRECONDITION when the driver is (de)initialized, and
// INTERNAL for any other driver error.
/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                  GpuEventHandle* event) {
  if (*event == nullptr) {
    return port::Status{port::error::INVALID_ARGUMENT,
                        "input event cannot be null"};
  }

  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipEventDestroy(*event);
  // The handle is cleared unconditionally: even on failure the event is
  // considered gone from the caller's perspective.
  *event = nullptr;

  switch (res) {
    case hipSuccess:
      return port::Status::OK();
    case hipErrorDeinitialized:
    case hipErrorNotInitialized:
      return port::Status{
          port::error::FAILED_PRECONDITION,
          absl::StrFormat("error destroying ROCM event in device %d: %s",
                          context->device_ordinal(), ToString(res).c_str())};
    default:
      return port::Status{
          port::error::INTERNAL,
          absl::StrFormat("error destroying ROCM event in device %d: %s",
                          context->device_ordinal(), ToString(res).c_str())};
  }
}
723 
RecordEvent(GpuContext * context,GpuEventHandle event,GpuStreamHandle stream)724 /* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
725                                                  GpuEventHandle event,
726                                                  GpuStreamHandle stream) {
727   ScopedActivateContext activated{context};
728   hipError_t res = tensorflow::wrap::hipEventRecord(event, stream);
729   switch (res) {
730     case hipSuccess:
731       return port::Status::OK();
732     case hipErrorDeinitialized:
733     case hipErrorNotInitialized:
734       return port::Status{
735           port::error::FAILED_PRECONDITION,
736           absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
737                           ToString(res).c_str())};
738     default:
739       return port::Status{
740           port::error::INVALID_ARGUMENT,
741           absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
742                           ToString(res).c_str())};
743   }
744 }
745 
QueryEvent(GpuContext * context,GpuEventHandle event)746 /* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
747     GpuContext* context, GpuEventHandle event) {
748   ScopedActivateContext activated{context};
749   hipError_t res = tensorflow::wrap::hipEventQuery(event);
750   if (res != hipSuccess && res != hipErrorNotReady) {
751     return port::Status{
752         port::error::INTERNAL,
753         absl::StrFormat("failed to query event: %s", ToString(res).c_str())};
754   }
755 
756   return res;
757 }
758 
GetEventElapsedTime(GpuContext * context,float * elapsed_milliseconds,GpuEventHandle start,GpuEventHandle stop)759 /* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
760                                                  float* elapsed_milliseconds,
761                                                  GpuEventHandle start,
762                                                  GpuEventHandle stop) {
763   ScopedActivateContext activated{context};
764   // The stop event must have completed in order for hipEventElapsedTime to
765   // work.
766   hipError_t res = tensorflow::wrap::hipEventSynchronize(stop);
767   if (res != hipSuccess) {
768     LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
769     return false;
770   }
771   res =
772       tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop);
773   if (res != hipSuccess) {
774     LOG(ERROR) << "failed to get elapsed time between events: "
775                << ToString(res);
776     return false;
777   }
778 
779   return true;
780 }
781 
WaitStreamOnEvent(GpuContext * context,GpuStreamHandle stream,GpuEventHandle event)782 /* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
783                                                GpuStreamHandle stream,
784                                                GpuEventHandle event) {
785   ScopedActivateContext activation{context};
786   hipError_t res =
787       tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */);
788   if (res != hipSuccess) {
789     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
790     return false;
791   }
792 
793   return true;
794 }
795 
SynchronizeContext(GpuContext * context)796 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
797   ScopedActivateContext activation{context};
798   hipError_t res = tensorflow::wrap::hipDeviceSynchronize();
799   if (res != hipSuccess) {
800     LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
801                << " :: " << port::CurrentStackTrace();
802     return false;
803   }
804 
805   return true;
806 }
807 
// Blocks the calling host thread until all work enqueued on `stream` has
// completed. Returns OK on success; otherwise an error status produced by
// RETURN_IF_ROCM_ERROR from the failing hipStreamSynchronize call.
/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
                                                       GpuStreamHandle stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipStreamSynchronize(stream),
                       "Could not synchronize on ROCM stream");
  VLOG(2) << "successfully synchronized stream " << stream << " on device "
          << context->device_ordinal();
  return port::Status::OK();
}
818 
IsStreamIdle(GpuContext * context,GpuStreamHandle stream)819 /* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
820                                           GpuStreamHandle stream) {
821   ScopedActivateContext activated{context};
822   CHECK(stream != nullptr);
823   hipError_t res = tensorflow::wrap::hipStreamQuery(stream);
824   if (res == hipSuccess) {
825     return true;
826   }
827 
828   if (res != hipErrorNotReady) {
829     LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
830   }
831   return false;
832 }
833 
// Blocking copy of `size` bytes from device memory (`gpu_src`) to host memory
// (`host_dst`). Returns OK on success; otherwise an error status produced by
// RETURN_IF_ROCM_ERROR from the failing hipMemcpyDtoH call.
/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
    GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size),
      absl::StrFormat("failed to synchronous memcpy from device to host: "
                      "host dst: %p; Gpu src: %p; size: %llu=0x%llx",
                      host_dst, absl::bit_cast<void*>(gpu_src), size, size));
  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
          << host_dst;
  return port::Status::OK();
}
846 
// Blocking copy of `size` bytes from host memory (`host_src`) to device
// memory (`gpu_dst`). Returns OK on success; otherwise an error status
// produced by RETURN_IF_ROCM_ERROR from the failing hipMemcpyHtoD call.
/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(
    GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
    uint64 size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      // const_cast: the HIP API takes a non-const source pointer.
      tensorflow::wrap::hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src),
                                      size),
      absl::StrFormat(
          "failed to synchronous memcpy from host to device: Gpu dst: %p;"
          " host src: %p; size: %llu=0x%llx",
          absl::bit_cast<void*>(gpu_dst), host_src, size, size));
  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
  return port::Status::OK();
}
861 
SynchronousMemcpyD2D(GpuContext * context,hipDeviceptr_t gpu_dst,hipDeviceptr_t gpu_src,uint64 size)862 /* static */ port::Status GpuDriver::SynchronousMemcpyD2D(
863     GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
864     uint64 size) {
865   ScopedActivateContext activation{context};
866   RETURN_IF_ROCM_ERROR(
867       tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size),
868       absl::StrFormat(
869           "failed to synchronous memcpy from host to device:Gpu dst: %p; "
870           "Gpu src: %p; size: %llu=0x%llx",
871           absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
872           size));
873   VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
874   return port::Status::OK();
875 }
876 
AsynchronousMemcpyD2H(GpuContext * context,void * host_dst,hipDeviceptr_t gpu_src,uint64 size,GpuStreamHandle stream)877 /* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
878                                                    void* host_dst,
879                                                    hipDeviceptr_t gpu_src,
880                                                    uint64 size,
881                                                    GpuStreamHandle stream) {
882   ScopedActivateContext activation{context};
883   hipError_t res =
884       tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
885   if (res != hipSuccess) {
886     LOG(ERROR) << absl::StrFormat(
887         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
888         "Gpu src: %p; size: %llu=0x%llx",
889         ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
890         size);
891     return false;
892   }
893   VLOG(2) << "successfully enqueued async memcpy d2h of " << size
894           << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
895           << host_dst << " on stream " << stream;
896   return true;
897 }
898 
AsynchronousMemcpyH2D(GpuContext * context,hipDeviceptr_t gpu_dst,const void * host_src,uint64 size,GpuStreamHandle stream)899 /* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
900                                                    hipDeviceptr_t gpu_dst,
901                                                    const void* host_src,
902                                                    uint64 size,
903                                                    GpuStreamHandle stream) {
904   ScopedActivateContext activation{context};
905   hipError_t res = tensorflow::wrap::hipMemcpyHtoDAsync(
906       gpu_dst, const_cast<void*>(host_src), size, stream);
907   if (res != hipSuccess) {
908     LOG(ERROR) << absl::StrFormat(
909         "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
910         "host src: %p; size: %llu=0x%llx",
911         ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
912         size);
913     return false;
914   }
915   VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
916           << " on stream " << stream;
917   return true;
918 }
919 
AsynchronousMemcpyD2D(GpuContext * context,hipDeviceptr_t gpu_dst,hipDeviceptr_t gpu_src,uint64 size,GpuStreamHandle stream)920 /* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
921                                                    hipDeviceptr_t gpu_dst,
922                                                    hipDeviceptr_t gpu_src,
923                                                    uint64 size,
924                                                    GpuStreamHandle stream) {
925   ScopedActivateContext activation{context};
926   hipError_t result =
927       tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
928   if (result != hipSuccess) {
929     LOG(ERROR) << absl::StrFormat(
930         "failed to enqueue async memcpy from device to device: %s"
931         "; Gpu dst: %p on %s %s"
932         "; Gpu src: %p on %s %s"
933         "; can access? %s; size: %llu=0x%llx",
934         ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
935         ROCMPointerToMemorySpaceString(gpu_dst).c_str(),
936         ROCMPointerToDeviceString(gpu_dst).c_str(),
937         absl::bit_cast<void*>(gpu_src),
938         ROCMPointerToMemorySpaceString(gpu_src).c_str(),
939         ROCMPointerToDeviceString(gpu_src).c_str(),
940         ROCMPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
941 
942     return false;
943   }
944   VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
945   return true;
946 }
947 
InitEvent(GpuContext * context,GpuEventHandle * event,EventFlags flags)948 /* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
949                                                GpuEventHandle* event,
950                                                EventFlags flags) {
951   int hipflags;
952   switch (flags) {
953     case EventFlags::kDefault:
954       hipflags = hipEventDefault;
955       break;
956     case EventFlags::kDisableTiming:
957       hipflags = hipEventDisableTiming | hipEventReleaseToSystem;
958       break;
959     default:
960       LOG(FATAL) << "impossible event flags: " << int(hipflags);
961   }
962 
963   ScopedActivateContext activated{context};
964   hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags);
965 
966   if (res == hipSuccess) {
967     return port::Status::OK();
968   } else if (res == hipErrorMemoryAllocation) {
969     return port::Status{port::error::RESOURCE_EXHAUSTED,
970                         "could not create ROCM event: out of device memory"};
971   } else {
972     return port::Status{
973         port::error::FAILED_PRECONDITION,
974         absl::StrCat("could not create ROCM event: ", ToString(res))};
975   }
976 }
977 
GetDeviceCount()978 /* static */ int GpuDriver::GetDeviceCount() {
979   int device_count = 0;
980   hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count);
981   if (res != hipSuccess) {
982     LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
983     return 0;
984   }
985 
986   if (FLAGS_gpuexec_rocm_device_0_only && device_count > 1) {
987     device_count = 1;
988   }
989   return device_count;
990 }
991 
GetComputeCapability(int * cc_major,int * cc_minor,hipDevice_t device)992 /* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
993                                                           int* cc_minor,
994                                                           hipDevice_t device) {
995   return port::Status(
996       port::error::INTERNAL,
997       absl::StrFormat("failed to get compute capability for device: %d "
998                       "(unsupported API on AMD Gpus)",
999                       device));
1000 }
1001 
GetPointerAddressRange(hipDeviceptr_t dptr,hipDeviceptr_t * base,size_t * size)1002 /* static */ port::Status GpuDriver::GetPointerAddressRange(
1003     hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
1004   hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, dptr);
1005   if (result == hipSuccess) {
1006     return port::Status::OK();
1007   } else if (result == hipErrorNotFound) {
1008     // We differentiate between "this pointer is unknown" (return here) and
1009     // "there was an internal error while performing this operation" (return
1010     // below).
1011     return port::Status{port::error::NOT_FOUND,
1012                         absl::StrFormat("not a device pointer %p; %s",
1013                                         reinterpret_cast<void*>(dptr),
1014                                         ToString(result).c_str())};
1015   }
1016 
1017   return port::Status{
1018       port::error::INTERNAL,
1019       absl::StrFormat("failed to get pointer into for device pointer %p; %s",
1020                       reinterpret_cast<void*>(dptr), ToString(result).c_str())};
1021 }
1022 
GetPointerMemorySpace(hipDeviceptr_t pointer)1023 /* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
1024     hipDeviceptr_t pointer) {
1025   unsigned int value;
1026   hipError_t result = hipSuccess;
1027   if (result == hipSuccess) {
1028     switch (value) {
1029       case hipMemoryTypeDevice:
1030         return MemorySpace::kDevice;
1031       case hipMemoryTypeHost:
1032         return MemorySpace::kHost;
1033       default:
1034         return port::Status{
1035             port::error::INTERNAL,
1036             absl::StrCat("unknown memory space provided by ROCM API: ", value)};
1037     }
1038   }
1039 
1040   return port::Status{
1041       port::error::INTERNAL,
1042       absl::StrCat("failed to query device pointer for memory space: ",
1043                    ToString(result))};
1044 }
1045 
GetPointerDevice(hipDeviceptr_t pointer)1046 /* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
1047     hipDeviceptr_t pointer) {
1048   hipPointerAttribute_t pointerAttributes;
1049   hipError_t result =
1050       tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer);
1051   if (result != hipSuccess) {
1052     return port::Status{
1053         port::error::INTERNAL,
1054         absl::StrCat("failed to get device for pointer: ", ToString(result))};
1055   }
1056 
1057   hipDevice_t device;
1058   result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device);
1059   if (result != hipSuccess) {
1060     return port::Status{
1061         port::error::INTERNAL,
1062         absl::StrCat("failed to get device for pointer: ", ToString(result))};
1063   }
1064 
1065   return device;
1066 }
1067 
GetGpuISAVersion(int * version,hipDevice_t device)1068 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
1069                                                       hipDevice_t device) {
1070   hipDeviceProp_t props;
1071   hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
1072   if (result == hipSuccess) {
1073     *version = props.gcnArch;
1074     return port::Status::OK();
1075   }
1076   *version = 0;
1077   return port::Status{
1078       port::error::INTERNAL,
1079       absl::StrFormat("failed to determine AMDGpu ISA version for device %d",
1080                       device)};
1081 }
1082 
GetGpuGCNArchName(hipDevice_t device,std::string * gcnArchName)1083 /* static */ port::Status GpuDriver::GetGpuGCNArchName(
1084     hipDevice_t device, std::string* gcnArchName) {
1085   hipDeviceProp_t props;
1086   hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
1087   if (result == hipSuccess) {
1088     *gcnArchName = props.gcnArchName;
1089     return port::Status::OK();
1090   }
1091   *gcnArchName = "";
1092   return port::Status{
1093       port::error::INTERNAL,
1094       absl::StrFormat("failed to determine AMDGpu GCN Arch Name for device %d",
1095                       device)};
1096 }
1097 
1098 // Helper function that turns the integer output of hipDeviceGetAttribute to
1099 // type T and wraps it in a StatusOr.
1100 template <typename T>
GetSimpleAttribute(hipDevice_t device,hipDeviceAttribute_t attribute)1101 static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
1102                                             hipDeviceAttribute_t attribute) {
1103   int value = -1;
1104   hipError_t result =
1105       tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device);
1106   if (result != hipSuccess) {
1107     return port::Status{
1108         port::error::NOT_FOUND,
1109         absl::StrCat("could not retrieve ROCM device attribute (", attribute,
1110                      "): ", ToString(result))};
1111   }
1112   T converted = value;
1113   return converted;
1114 }
1115 
GetMultiprocessorCount(hipDevice_t device)1116 /* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
1117     hipDevice_t device) {
1118   return GetSimpleAttribute<int>(device, hipDeviceAttributeMultiprocessorCount);
1119 }
1120 
GetMaxSharedMemoryPerCore(hipDevice_t device)1121 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
1122     hipDevice_t device) {
1123   return GetSimpleAttribute<int64>(
1124       device, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor);
1125 }
1126 
GetMaxSharedMemoryPerBlock(hipDevice_t device)1127 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
1128     hipDevice_t device) {
1129   return GetSimpleAttribute<int64>(device,
1130                                    hipDeviceAttributeMaxSharedMemoryPerBlock);
1131 }
1132 
GetMaxThreadsPerMultiprocessor(hipDevice_t device)1133 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
1134     hipDevice_t device) {
1135   return GetSimpleAttribute<int64>(
1136       device, hipDeviceAttributeMaxThreadsPerMultiProcessor);
1137 }
1138 
GetMaxThreadsPerBlock(hipDevice_t device)1139 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
1140     hipDevice_t device) {
1141   return GetSimpleAttribute<int64>(device,
1142                                    hipDeviceAttributeMaxThreadsPerBlock);
1143 }
1144 
GetMaxRegistersPerBlock(hipDevice_t device)1145 /* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
1146     hipDevice_t device) {
1147   return GetSimpleAttribute<int64>(device,
1148                                    hipDeviceAttributeMaxRegistersPerBlock);
1149 }
1150 
GetThreadsPerWarp(hipDevice_t device)1151 /* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
1152     hipDevice_t device) {
1153   return GetSimpleAttribute<int64>(device, hipDeviceAttributeWarpSize);
1154 }
1155 
GetGridLimits(int * x,int * y,int * z,hipDevice_t device)1156 /* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
1157                                            hipDevice_t device) {
1158   int value;
1159   hipError_t res = tensorflow::wrap::hipDeviceGetAttribute(
1160       &value, hipDeviceAttributeMaxGridDimX, device);
1161   if (res != hipSuccess) {
1162     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
1163     return false;
1164   }
1165   *x = value;
1166 
1167   res = tensorflow::wrap::hipDeviceGetAttribute(
1168       &value, hipDeviceAttributeMaxGridDimY, device);
1169   if (res != hipSuccess) {
1170     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
1171     return false;
1172   }
1173   *y = value;
1174 
1175   res = tensorflow::wrap::hipDeviceGetAttribute(
1176       &value, hipDeviceAttributeMaxGridDimZ, device);
1177   if (res != hipSuccess) {
1178     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
1179     return false;
1180   }
1181   *z = value;
1182   return true;
1183 }
1184 
GetDriverVersion(int * driver_version)1185 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
1186   hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version);
1187   if (res != hipSuccess) {
1188     LOG(ERROR) << "failed to query driver version: " << ToString(res);
1189     return false;
1190   }
1191 
1192   return true;
1193 }
1194 
GetDeviceProperties(hipDeviceProp_t * device_properties,int device_ordinal)1195 /* static */ bool GpuDriver::GetDeviceProperties(
1196     hipDeviceProp_t* device_properties, int device_ordinal) {
1197   hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties,
1198                                                             device_ordinal);
1199   if (res != hipSuccess) {
1200     LOG(ERROR) << "failed to query device properties: " << ToString(res);
1201     return false;
1202   }
1203 
1204   return true;
1205 }
1206 
// Generic accessor for any integer-valued device attribute; thin forwarder
// to the GetSimpleAttribute helper above.
/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
    hipDeviceAttribute_t attribute, hipDevice_t device) {
  return GetSimpleAttribute<int>(device, attribute);
}
1211 
// Stub: HIP does not yet expose an ECC-status query, so no driver call is
// made here.
/* static */ bool GpuDriver::IsEccEnabled(hipDevice_t device, bool* result) {
  int value = -1;
  hipError_t res = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  if (res != hipSuccess) {
    // Dead branch while the stub hard-codes `res` to hipSuccess above.
    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
    return false;
  }

  // NOTE(review): with the stub value of -1, *result converts to true, i.e.
  // this reports ECC as enabled even though nothing was queried — confirm
  // callers tolerate that before relying on it.
  *result = value;
  return true;
}
1224 
GetDeviceMemoryInfo(GpuContext * context,int64 * free_out,int64 * total_out)1225 /* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
1226                                                  int64* free_out,
1227                                                  int64* total_out) {
1228   ScopedActivateContext activation{context};
1229   size_t free = 0;
1230   size_t total = 0;
1231   hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total);
1232   if (res != hipSuccess) {
1233     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
1234     return false;
1235   }
1236 
1237   *free_out = free;
1238   *total_out = total;
1239   return true;
1240 }
1241 
GetDeviceTotalMemory(hipDevice_t device,uint64 * result)1242 /* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
1243                                                   uint64* result) {
1244   size_t value = -1;
1245   hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device);
1246   if (res != hipSuccess) {
1247     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
1248     return false;
1249   }
1250 
1251   *result = value;
1252   return true;
1253 }
1254 
GetPCIBusID(hipDevice_t device)1255 /* static */ string GpuDriver::GetPCIBusID(hipDevice_t device) {
1256   string pci_bus_id;
1257   static const int kBufferSize = 64;
1258   absl::InlinedVector<char, 4> chars(kBufferSize);
1259   chars[kBufferSize - 1] = '\0';
1260   hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(
1261       chars.begin(), kBufferSize - 1, device);
1262   if (res != hipSuccess) {
1263     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
1264     return pci_bus_id;
1265   }
1266   pci_bus_id = chars.begin();
1267   return pci_bus_id;
1268 }
1269 
CanEnablePeerAccess(GpuContext * from,GpuContext * to)1270 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
1271                                                  GpuContext* to) {
1272   if (from->device_ordinal() == to->device_ordinal()) {
1273     return true;  // A device can always access its own memory.
1274   }
1275 
1276   int can_access_peer = -1;
1277   hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer(
1278       &can_access_peer, from->device_ordinal(), to->device_ordinal());
1279   if (res != hipSuccess) {
1280     LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
1281     return false;
1282   }
1283 
1284   return can_access_peer;
1285 }
1286 
EnablePeerAccess(GpuContext * from,GpuContext * to)1287 /* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
1288                                                       GpuContext* to) {
1289   if (from->device_ordinal() == to->device_ordinal()) {
1290     return port::Status::OK();  // A device can always access its own memory.
1291   }
1292 
1293   ScopedActivateContext activated{from};
1294   hipError_t result = tensorflow::wrap::hipDeviceEnablePeerAccess(
1295       to->device_ordinal(), 0 /* = flags */);
1296   if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
1297     return port::Status{
1298         port::error::INTERNAL,
1299         absl::StrFormat("failed to enable peer access from %d to %d: %s",
1300                         from->device_ordinal(), to->device_ordinal(),
1301                         ToString(result).c_str())};
1302   }
1303 
1304   return port::Status::OK();
1305 }
1306 
// Stub: HIP occupancy calculation is not wired up yet, so this currently
// always returns 0 occupied blocks (`max_blocks` is never updated).
/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
    GpuContext* context, hipFunction_t kernel, int threads_per_block,
    size_t dynamic_shared_memory_bytes) {
  ScopedActivateContext activation{context};

  int max_blocks = 0;
  hipError_t result = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  if (result != hipSuccess) {
    // Dead branch while the stub hard-codes `result` to hipSuccess above.
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("failed to calculate occupancy of kernel %p: %s",
                        kernel, ToString(result).c_str())};
  }

  return max_blocks;
}
1324 
1325 }  // namespace gpu
1326 }  // namespace stream_executor
1327