1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // CUDA userspace driver library wrapper functionality. 17 18 #ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_ 19 #define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_ 20 21 #include <stddef.h> 22 23 #include "tensorflow/stream_executor/device_options.h" 24 #include "tensorflow/stream_executor/gpu/gpu_types.h" 25 #include "tensorflow/stream_executor/lib/status.h" 26 #include "tensorflow/stream_executor/lib/statusor.h" 27 #include "tensorflow/stream_executor/platform/port.h" 28 29 namespace stream_executor { 30 namespace gpu { 31 32 // Identifies the memory space where an allocation resides. See 33 // GpuDriver::GetPointerMemorySpace(). 34 enum class MemorySpace { kHost, kDevice }; 35 36 // Returns a casual string, such as "host" for the provided memory space. 37 std::string MemorySpaceString(MemorySpace memory_space); 38 39 class GpuContext; 40 41 // GpuDriver contains wrappers for calls to the userspace library driver. It's 42 // useful to isolate these calls and put basic wrappers around them to separate 43 // userspace library driver behaviors from the rest of the program. 44 // 45 // At the moment it's simply used as a namespace. 46 // 47 // The calls log any specific errors internally and return whether the operation 48 // was successful to the caller. 
//
// The order of parameters is generally kept symmetric with the underlying CUDA
// driver API.
//
// Links on functions are to specific documentation under
// http://docs.nvidia.com/cuda/cuda-driver-api/
//
// Thread safety: these functions should not be used from signal handlers.
class GpuDriver {
 public:
  // Wraps a call to cuInit with logging to help indicate what has gone wrong in
  // the case of failure. Safe to call multiple times; will be fast on all calls
  // after the first.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
  static port::Status Init();

  // Returns the device associated with the given context.
  // device is an outparam owned by the caller, must not be null.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
  static port::StatusOr<GpuDeviceHandle> DeviceFromContext(GpuContext* context);

  // Creates a new CUDA stream associated with the given context via
  // cuStreamCreate.
  // stream is an outparam owned by the caller, must not be null.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
  static bool CreateStream(GpuContext* context, GpuStreamHandle* stream,
                           int priority = 0);

  // Destroys a CUDA stream associated with the given context.
  // stream is owned by the caller, must not be null, and *stream is set to null
  // if the stream is successfully destroyed.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
  static void DestroyStream(GpuContext* context, GpuStreamHandle* stream);

  // CUDA events can explicitly disable event TSC retrieval for some presumed
  // performance improvement if timing is unnecessary.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
  enum class EventFlags { kDefault, kDisableTiming };

  // Creates a new event associated with the given context.
  // result is an outparam owned by the caller and must not be null.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
  static port::Status InitEvent(GpuContext* context, GpuEventHandle* result,
                                EventFlags flags);

  // Destroys *event and turns it into a nullptr. event may not be null, but
  // *event may be, via cuEventDestroy
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
  static port::Status DestroyEvent(GpuContext* context, GpuEventHandle* event);

  // Allocates a GPU memory space of size bytes associated with the given
  // context via cuMemAlloc.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
  static void* DeviceAllocate(GpuContext* context, uint64 bytes);

  // Deallocates a GPU memory space of size bytes associated with the given
  // context via cuMemFree.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
  static void DeviceDeallocate(GpuContext* context, void* location);

  // Allocates a unified memory space of size bytes associated with the given
  // context via cuMemAllocManaged.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
  // (supported on CUDA only)
  static void* UnifiedMemoryAllocate(GpuContext* context, uint64 bytes);

  // Deallocates a unified memory space of size bytes associated with the given
  // context via cuMemFree.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
  // (supported on CUDA only)
  static void UnifiedMemoryDeallocate(GpuContext* context, void* location);

  // Allocates page-locked and CUDA-registered memory on the host via
  // cuMemAllocHost.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
  static void* HostAllocate(GpuContext* context, uint64 bytes);

  // Deallocates a location created by HostAllocate, via cuMemFreeHost.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
  static void HostDeallocate(GpuContext* context, void* location);

  // Registers a memory region at location of size bytes via cuMemHostRegister.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
  static bool HostRegister(GpuContext* context, void* location, uint64 bytes);

  // Unregisters a memory region that was previously registered at location via
  // cuMemHostUnregister.
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
  //
  // TODO(leary) verify an error will be returned if the location wasn't
  // previously registered.
  static bool HostUnregister(GpuContext* context, void* location);

  // Virtual memory support was added to CUDA in 10.2
#if CUDA_VERSION >= 10020

  // Reserves a range of virtual device memory addresses via
  // cuMemAddressReserve. bytes must be a multiple of the host page size.
  // Returns nullptr base address in VmemSpan if the reservation fails.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1ge489256c107df2a07ddf96d80c86cd9b
  struct VmemSpan {
    GpuDevicePtr base;
    // Size in bytes.
    uint64 size_bytes;
  };
  static port::StatusOr<VmemSpan> ReserveVirtualMemory(GpuContext* context,
                                                       uint64 bytes);

  // Frees a range of virtual addresses that were previously reserved through
  // ReserveVirtualMemory via cuMemAddressFree.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g6993ecea2ea03e1b802b8255edc2da5b
  static void FreeVirtualMemory(GpuContext* context, VmemSpan reservation);

  // Calculates the minimum alignment for memory allocations done through
  // cuMemCreate via cuMemGetAllocationGranularity.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g30ee906c2cf66a0347b3dfec3d7eb31a
  static port::StatusOr<uint64> GetMinAllocationGranularity(
      GpuDeviceHandle device);

  // Allocates physical memory and returns a handle that can be mapped to
  // virtual addresses via cuMemCreate. bytes must be a multiple of the
  // granularity returned by GetMinAllocationGranularity.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c
  struct GenericMemoryHandle {
    uint64 handle;
    uint64 bytes;
  };
  static port::StatusOr<GenericMemoryHandle> CreateMemoryHandle(
      GpuContext* context, uint64 bytes);

  // Frees memory represented by the provided MemoryHandle via cuMemRelease.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g3014f0759f43a8d82db951b8e4b91d68
  static void ReleaseMemoryHandle(GpuContext* context,
                                  GenericMemoryHandle handle);

  // Maps a memory allocation handle to a reserved virtual address range via
  // cuMemMap and sets the appropriate access settings via cuMemSetAccess.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gff1d395423af5c5c75375516959dae56
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g1b6b12b10e8324bf462ecab4e7ef30e1
  static port::Status MapMemory(
      GpuContext* context, GpuDevicePtr va, const GenericMemoryHandle& handle,
      const std::vector<GpuDeviceHandle>& device_handles);

  // Unmaps the backing memory from the given virtual address range. This range
  // must fully unmap a memory handle that was mapped using MapMemory; partial
  // unmapping is not supported.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gfb50aac00c848fd7087e858f59bf7e2a
  static void UnmapMemory(GpuContext* context, GpuDevicePtr va, uint64 bytes);

#endif  // CUDA_VERSION >= 10020

  // Given a device ordinal, returns a device handle into the device outparam,
  // which must not be null.
  //
  // N.B. these device handles do not have a corresponding destroy function in
  // the CUDA driver API.
  static port::Status GetDevice(int device_ordinal, GpuDeviceHandle* device);

  // Given a device handle, returns the name reported by the driver for the
  // device.
  static port::Status GetDeviceName(GpuDeviceHandle device,
                                    std::string* device_name);

  // Given a device to create a context for, returns a context handle into the
  // context outparam, which must not be null.
  //
  // N.B. CUDA contexts are weird. They are implicitly associated with the
  // calling thread. Current documentation on contexts and their influence on
  // userspace processes is given here:
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
  static port::Status CreateContext(int device_ordinal, GpuDeviceHandle device,
                                    const DeviceOptions& device_options,
                                    GpuContext** context);

  // Destroys the provided context via cuCtxDestroy.
  // Don't do this while clients could still be using the context, per the docs
  // bad things will happen.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
  static void DestroyContext(GpuContext* context);

  // Queries the runtime for the specified attribute of the specified function.
  // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
  // in terms of integer-sized values, so there's no potential for overrun (as
  // of CUDA 5.5).
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
  static port::Status FuncGetAttribute(GpuFunctionAttribute attribute,
                                       GpuFunctionHandle function,
                                       int* attribute_value);

  // Sets the preferred cache configuration for the specified function.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
  static port::Status FuncSetCacheConfig(GpuFunctionHandle function,
                                         GpuFuncCachePreference cache_config);

  // Gets the preferred shared memory bank configuration for the specified
  // CONTEXT (not function!), either default or four- or eight-byte bank size.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
  static port::StatusOr<GpuSharedMemConfig> ContextGetSharedMemConfig(
      GpuContext* context);

  // Sets the preferred shared memory bank configuration for the specified
  // CONTEXT (not function!), either default or four- or eight-byte bank size.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
  static port::Status ContextSetSharedMemConfig(
      GpuContext* context, GpuSharedMemConfig shared_mem_config);

  // Launches a CUDA kernel via cuLaunchKernel.
  // TODO(leary) describe the structure of kernel_params and extra in a readable
  // way.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
  static port::Status LaunchKernel(
      GpuContext* context, GpuFunctionHandle function, unsigned int grid_dim_x,
      unsigned int grid_dim_y, unsigned int grid_dim_z,
      unsigned int block_dim_x, unsigned int block_dim_y,
      unsigned int block_dim_z, unsigned int shared_mem_bytes,
      GpuStreamHandle stream, void** kernel_params, void** extra);

  // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
  // handle in "module". Any error logs that are produced are logged internally.
  // (supported on CUDA only)
  static port::Status LoadPtx(GpuContext* context, const char* ptx_contents,
                              GpuModuleHandle* module);

  // Loads cubin_bytes with the CUDA driver's blob loading interface and stores
  // the resulting handle in "module".
  // (supported on CUDA only)
  static port::Status LoadCubin(GpuContext* context, const char* cubin_bytes,
                                GpuModuleHandle* module);

  // Loads HSACO with the ROCM runtime and stores the resulting handle in
  // "module". Any error logs that are produced are logged internally.
  // (supported on ROCm only)
  static port::Status LoadHsaco(GpuContext* context, const char* hsaco_contents,
                                GpuModuleHandle* module);

  // Retrieves a named kernel from a loaded module, and places the resulting
  // handle into function (outparam) on success. Neither kernel_name nor
  // function may be null. No ownership is taken of kernel_name.
  static bool GetModuleFunction(GpuContext* context, GpuModuleHandle module,
                                const char* kernel_name,
                                GpuFunctionHandle* function);

  // Retrieves a named global/constant symbol from a loaded module, and returns
  // a device pointer and size of the symbol on success. symbol_name may not be
  // null. At least one of dptr or bytes should not be null. No ownership is
  // taken of symbol_name.
  static bool GetModuleSymbol(GpuContext* context, GpuModuleHandle module,
                              const char* symbol_name, GpuDevicePtr* dptr,
                              size_t* bytes);

  // Unloads module from the current context via cuModuleUnload.
  // TODO(leary) the documentation doesn't say what kind of disasters happen
  // if you try to unload a module while its GpuFunctionHandles are in use.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
  static void UnloadModule(GpuContext* context, GpuModuleHandle module);

  // Performs a synchronous memset of the device memory segment via cuMemsetD8.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
  static port::Status SynchronousMemsetUint8(GpuContext* context,
                                             GpuDevicePtr location, uint8 value,
                                             size_t size);

  // Performs a synchronous memset of the device memory segment via cuMemsetD32.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
  static port::Status SynchronousMemsetUint32(GpuContext* context,
                                              GpuDevicePtr location,
                                              uint32 value,
                                              size_t uint32_count);

  // Performs an asynchronous memset of the device memory segment via
  // cuMemsetD8Async.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
  static port::Status AsynchronousMemsetUint8(GpuContext* context,
                                              GpuDevicePtr location,
                                              uint8 value, size_t uint32_count,
                                              GpuStreamHandle stream);

  // Performs an asynchronous memset of the device memory segment via
  // cuMemsetD32Async.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
  static port::Status AsynchronousMemsetUint32(GpuContext* context,
                                               GpuDevicePtr location,
                                               uint32 value,
                                               size_t uint32_count,
                                               GpuStreamHandle stream);

  // -- Synchronous memcopies.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169

  static port::Status SynchronousMemcpyD2H(GpuContext* context, void* host_dst,
                                           GpuDevicePtr gpu_src, uint64 size);
  static port::Status SynchronousMemcpyH2D(GpuContext* context,
                                           GpuDevicePtr gpu_dst,
                                           const void* host_src, uint64 size);
  static port::Status SynchronousMemcpyD2D(GpuContext* context,
                                           GpuDevicePtr gpu_dst,
                                           GpuDevicePtr gpu_src, uint64 size);

  // -- Asynchronous memcopies.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362

  static bool AsynchronousMemcpyD2H(GpuContext* context, void* host_dst,
                                    GpuDevicePtr gpu_src, uint64 size,
                                    GpuStreamHandle stream);
  static bool AsynchronousMemcpyH2D(GpuContext* context, GpuDevicePtr gpu_dst,
                                    const void* host_src, uint64 size,
                                    GpuStreamHandle stream);
  static bool AsynchronousMemcpyD2D(GpuContext* context, GpuDevicePtr gpu_dst,
                                    GpuDevicePtr gpu_src, uint64 size,
                                    GpuStreamHandle stream);

  // The CUDA stream callback type signature.
  // The data passed to AddStreamCallback is subsequently passed to this
  // callback when it fires.
  //
  // Some notable things:
  // * Callbacks must not make any CUDA API calls.
  // * Callbacks from independent streams execute in an undefined order and may
  //   be serialized.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
  typedef void (*StreamCallback)(GpuStreamHandle stream, GpuStatus status,
                                 void* data);

  // Enqueues a callback operation into stream.
  // See StreamCallback above and the NVIDIA documentation for additional
  // details.
  static bool AddStreamCallback(GpuContext* context, GpuStreamHandle stream,
                                StreamCallback callback, void* data);

  // Causes stream to wait for event to trigger before proceeding via
  // cuStreamWaitEvent.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
  static bool WaitStreamOnEvent(GpuContext* context, GpuStreamHandle stream,
                                GpuEventHandle event);

  // Blocks the calling thread until the operations enqueued onto stream have
  // been completed, via cuStreamSynchronize.
  //
  // TODO(leary) if a pathological thread enqueues operations onto the stream
  // while another thread blocks like this, can you wind up waiting an unbounded
  // amount of time?
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
  static port::Status SynchronizeStream(GpuContext* context,
                                        GpuStreamHandle stream);

  // Blocks the calling thread until the operations associated with the context
  // have been completed, via cuCtxSynchronize.
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
  static bool SynchronizeContext(GpuContext* context);

  // Returns true if all stream tasks have completed at time of the call. Note
  // the potential for races around this call (if another thread adds work to
  // the stream immediately after this returns).
  static bool IsStreamIdle(GpuContext* context, GpuStreamHandle stream);

  // Returns whether code in the from context can access memory in the to
  // context via cuDeviceCanAccessPeer.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
  static bool CanEnablePeerAccess(GpuContext* from, GpuContext* to);

  // Returns whether the from device can access memory in the to
  // device via cuDeviceCanAccessPeer. Because of differences between ROCM and
  // CUDA, this API is not supported in ROCM builds and will result in a link
  // error if used.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
  static bool CanEnablePeerAccess(GpuDeviceHandle from, GpuDeviceHandle to);

  // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
  static port::Status EnablePeerAccess(GpuContext* from, GpuContext* to);

  // Returns the elapsed milliseconds between start and stop via
  // cuEventElapsedTime.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
  static bool GetEventElapsedTime(GpuContext* context,
                                  float* elapsed_milliseconds,
                                  GpuEventHandle start, GpuEventHandle stop);

  // Records that an event occurred when execution reaches the current point in
  // the stream via cuEventRecord.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
  static port::Status RecordEvent(GpuContext* context, GpuEventHandle event,
                                  GpuStreamHandle stream);

  // Polls (without blocking) to determine the status of an event - pending or
  // complete (or an error status).
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
  static port::StatusOr<GpuStatus> QueryEvent(GpuContext* context,
                                              GpuEventHandle event);

  // -- Pointer-specific calls.

  // Returns the context in which pointer was allocated or registered.
  static port::StatusOr<GpuContext*> GetPointerContext(GpuDevicePtr pointer);

  // Returns the device associated with the context from GetPointerContext().
  static port::StatusOr<GpuDeviceHandle> GetPointerDevice(GpuDevicePtr pointer);

  // Returns the memory space addressed by pointer.
  static port::StatusOr<MemorySpace> GetPointerMemorySpace(
      GpuDevicePtr pointer);

  // Returns the base address and size of the device pointer dptr.
  static port::Status GetPointerAddressRange(GpuDevicePtr dptr,
                                             GpuDevicePtr* base, size_t* size);

  // -- Device-specific calls.

  // Returns the compute capability for the device; i.e (3, 5).
  // This is currently done via the deprecated device API.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
  // (supported on CUDA only)
  static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
                                           GpuDeviceHandle device);

  // Returns Gpu ISA version for the device; i.e 803, 900.
  // (supported on ROCm only)
  static port::Status GetGpuISAVersion(int* version, GpuDeviceHandle device);

  // Return the full GCN Architecture Name for the device
  // for eg: amdgcn-amd-amdhsa--gfx908:sramecc+:xnack-
  // (supported on ROCm only)
  static port::Status GetGpuGCNArchName(GpuDeviceHandle device,
                                        std::string* gcnArchName);

  // Returns the number of multiprocessors on the device (note that the device
  // may be multi-GPU-per-board).
  static port::StatusOr<int> GetMultiprocessorCount(GpuDeviceHandle device);

  // Returns the limit on number of threads that can be resident in a single
  // multiprocessor.
  static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(
      GpuDeviceHandle device);

  // Returns the limit on number of threads which may be resident for a single
  // block (cooperative thread array).
  static port::StatusOr<int64> GetMaxThreadsPerBlock(GpuDeviceHandle device);

  // Returns the amount of shared memory available on a single GPU core (i.e.
  // SM on NVIDIA devices).
  static port::StatusOr<int64> GetMaxSharedMemoryPerCore(
      GpuDeviceHandle device);

  // Returns the amount of shared memory available for a single block
  // (cooperative thread array).
  static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(
      GpuDeviceHandle device);

  // Returns the maximum supported number of registers per block.
  static port::StatusOr<int64> GetMaxRegistersPerBlock(GpuDeviceHandle device);

  // Returns the number of threads per warp.
  static port::StatusOr<int64> GetThreadsPerWarp(GpuDeviceHandle device);

  // Queries the grid limits for device with cuDeviceGetAttribute calls.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
  static bool GetGridLimits(int* x, int* y, int* z, GpuDeviceHandle device);

  // Returns a grab-bag of device properties in a caller-owned device_properties
  // structure for device_ordinal via cuDeviceGetProperties.
  //
  // This call is deprecated in the NVIDIA driver API; its replacement is
  // GetDeviceAttribute
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
  static bool GetDeviceProperties(GpuDeviceProperty* device_properties,
                                  int device_ordinal);

  // Gets a specific integer-valued property about the given device.
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
  static port::StatusOr<int> GetDeviceAttribute(GpuDeviceAttribute attribute,
                                                GpuDeviceHandle device);

  // Returns whether ECC is enabled for the given GpuDeviceHandle via
  // cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
  static bool IsEccEnabled(GpuDeviceHandle device, bool* result);

  // Returns the total amount of memory available for allocation by the CUDA
  // context, in bytes, via cuDeviceTotalMem.
  static bool GetDeviceTotalMemory(GpuDeviceHandle device, uint64* result);

  // Returns the free amount of memory and total amount of memory, as reported
  // by cuMemGetInfo.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
  static bool GetDeviceMemoryInfo(GpuContext* context, int64* free,
                                  int64* total);

  // Returns a PCI bus id string for the device.
  // [domain]:[bus]:[device].[function]
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
  static std::string GetPCIBusID(GpuDeviceHandle device);

  // -- Context- and device-independent calls.

  // Returns the number of visible CUDA device via cuDeviceGetCount.
  // This should correspond to the set of device ordinals available.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
  static int GetDeviceCount();

  // Returns the driver version number via cuDriverGetVersion.
  // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
  // instead, the CUDA toolkit release number that this driver is compatible
  // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
  // compatible driver).
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
  static bool GetDriverVersion(int* driver_version);

  // -- Other calls

  // Returns the maximum number of blocks (per multiprocessor) occupied by the
  // specified kernel/GpuFunctionHandle when launched with the specified
  // parameters.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
  static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
      GpuContext* context, GpuFunctionHandle kernel, int threads_per_block,
      size_t dynamic_shared_memory_bytes);

  // Seam for injecting an error at CUDA initialization time for testing
  // purposes.
  static bool driver_inject_init_error_;
};

// Ensures a context is activated within a scope.
class ScopedActivateContext {
 public:
  // Activates the context via cuCtxSetCurrent, if it is not the currently
  // active context (a la cuCtxGetCurrent). Note the alternative push/pop
  // mechanism is said by NVIDIA to be relatively slow and deprecated.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
  explicit ScopedActivateContext(GpuContext* context);

  // Checks that the context has remained activated for the duration of the
  // scope.
  ~ScopedActivateContext();

 private:
  GpuContext* to_restore_ = nullptr;
};

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_