/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_

#include <functional>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>

#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"

namespace stream_executor {
namespace gpu {

// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class GpuExecutor : public internal::StreamExecutorInterface {
 public:
  // plugin_config specifies which plugin implementations (BLAS, DNN, FFT,
  // RNG) this executor should use.
  explicit GpuExecutor(const PluginConfig& plugin_config)
      : device_(0),
        context_(nullptr),
        device_ordinal_(0),
        cc_major_(0),
        cc_minor_(0),
        version_(0),
        plugin_config_(plugin_config) {}
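  // A rough lifecycle sketch (illustrative only, not part of the API
  // contract): executors are normally created and initialized by the
  // platform's executor cache rather than constructed directly.
  //
  //   GpuExecutor executor(PluginConfig{});
  //   port::Status init_status =
  //       executor.Init(/*device_ordinal=*/0, DeviceOptions::Default());
  //   if (!init_status.ok()) {
  //     ...  // Initialization failed; the executor must not be used.
  //   }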
  // See the corresponding StreamExecutor methods for method comments on the
  // following overrides.

  ~GpuExecutor() override;

  port::Status Init(int device_ordinal, DeviceOptions device_options) override;

  port::Status GetKernel(const MultiKernelLoaderSpec& spec,
                         KernelBase* kernel) override;
  // (supported on CUDA only)
  void UnloadKernel(const KernelBase* kernel) override;
  port::Status LoadModule(const MultiModuleLoaderSpec& spec,
                          ModuleHandle* module_handle) override;
  bool UnloadModule(ModuleHandle module_handle) override;

  port::Status Launch(Stream* stream, const ThreadDim& thread_dims,
                      const BlockDim& block_dims, const KernelBase& k,
                      const KernelArgsArrayBase& args) override;

  // (supported on CUDA only)
  int CalculateOccupancy(const DeviceDescription& device_description,
                         uint64 registers_per_thread,
                         uint64 shared_memory_per_block,
                         const ThreadDim& thread_dims, GpuFunctionHandle func);

  // (supported on CUDA only)
  int CompareOccupancy(int* initial_blocks,
                       const DeviceDescription& device_description,
                       uint64 registers_per_thread,
                       uint64 shared_memory_per_block,
                       const ThreadDim& thread_dims, GpuFunctionHandle func);

  DeviceMemoryBase Allocate(uint64 size, int64 memory_space) override;

  void* GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                     uint64 size_bytes) override;

  void Deallocate(DeviceMemoryBase* mem) override;

  void* UnifiedMemoryAllocate(uint64 size) override {
    return GpuDriver::UnifiedMemoryAllocate(context_, size);
  }

  void UnifiedMemoryDeallocate(void* location) override {
    return GpuDriver::UnifiedMemoryDeallocate(context_, location);
  }
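  // Sketch of the unified-memory round trip the two wrappers above provide
  // (illustrative only; callers normally reach this through StreamExecutor):
  //
  //   void* ptr = executor->UnifiedMemoryAllocate(bytes);
  //   if (ptr != nullptr) {
  //     ...  // ptr is addressable from both host and device code.
  //     executor->UnifiedMemoryDeallocate(ptr);
  //   }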
  // CUDA allocation/registration functions are necessary because the driver
  // internally sets up buffers for DMA operations (and page locks them).
  // There's no external interface for us to otherwise control these DMA
  // settings.
  void* HostMemoryAllocate(uint64 size) override {
    return GpuDriver::HostAllocate(context_, size);
  }

  void HostMemoryDeallocate(void* location) override {
    return GpuDriver::HostDeallocate(context_, location);
  }

  bool HostMemoryRegister(void* location, uint64 size) override;

  bool HostMemoryUnregister(void* location) override;

  bool SynchronizeAllActivity() override;

  port::Status SynchronousMemZero(DeviceMemoryBase* location,
                                  uint64 size) override;

  port::Status SynchronousMemSet(DeviceMemoryBase* location, int value,
                                 uint64 size) override;

  port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                 const void* host_src, uint64 size) override;

  port::Status SynchronousMemcpy(void* host_dst,
                                 const DeviceMemoryBase& gpu_src,
                                 uint64 size) override;

  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
                                               const DeviceMemoryBase& gpu_src,
                                               uint64 size) override;

  port::Status MemZero(Stream* stream, DeviceMemoryBase* location,
                       uint64 size) override;
  port::Status Memset(Stream* stream, DeviceMemoryBase* location,
                      uint8 pattern, uint64 size) override;
  port::Status Memset32(Stream* stream, DeviceMemoryBase* location,
                        uint32 pattern, uint64 size) override;

  bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
              uint64 size) override;

  bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
              uint64 size) override;

  bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
                            const DeviceMemoryBase& gpu_src,
                            uint64 size) override;

  bool HostCallback(Stream* stream,
                    std::function<port::Status()> callback) override;

  bool AllocateStream(Stream* stream) override;

  void DeallocateStream(Stream* stream) override;

  bool CreateStreamDependency(Stream* dependent, Stream* other) override;

  bool AllocateTimer(Timer* timer) override;

  void DeallocateTimer(Timer* timer) override;

  bool StartTimer(Stream* stream, Timer* timer) override;

  bool StopTimer(Stream* stream, Timer* timer) override;

  port::Status AllocateEvent(Event* event) override;

  port::Status DeallocateEvent(Event* event) override;

  port::Status RecordEvent(Stream* stream, Event* event) override;

  port::Status WaitForEvent(Stream* stream, Event* event) override;

  Event::Status PollForEventStatus(Event* event) override;

  port::Status BlockHostUntilDone(Stream* stream) override;

  int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }

  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool DeviceMemoryUsage(int64* free, int64* total) const override;
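  // Illustrative host-to-device copy using the overrides above (a sketch;
  // production code drives these calls through the Stream and StreamExecutor
  // wrappers, and `stream`/`host_src` here are assumed to be valid):
  //
  //   DeviceMemoryBase dst = executor->Allocate(size, /*memory_space=*/0);
  //   if (!dst.is_null() && executor->Memcpy(stream, &dst, host_src, size)) {
  //     port::Status done = executor->BlockHostUntilDone(stream);
  //   }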
  // Searches for the named symbol and, if found, returns its device pointer
  // and size via `mem` and `bytes`. Returns false if the symbol does not
  // exist.
  bool GetSymbol(const std::string& symbol_name, ModuleHandle module_handle,
                 void** mem, size_t* bytes) override;

  port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
      const override {
    return CreateDeviceDescription(device_ordinal_);
  }

  static port::StatusOr<std::unique_ptr<DeviceDescription>>
  CreateDeviceDescription(int device_ordinal);

  bool SupportsBlas() const override;

  blas::BlasSupport* CreateBlas() override;

  bool SupportsFft() const override;

  fft::FftSupport* CreateFft() override;

  bool SupportsRng() const override;

  rng::RngSupport* CreateRng() override;

  bool SupportsDnn() const override;

  dnn::DnnSupport* CreateDnn() override;

  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
      override;

  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
      override;

  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;

  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;

  void* GpuContextHack() override;

  GpuContext* gpu_context();

 private:
  // Attempts to find a more specific version of the file indicated by
  // filename by looking for compute-capability-suffixed versions; e.g.
  // looking for "foo.ptx" will check whether "foo.ptx.cc30.ptx" is present
  // on a compute-capability-3.0 machine.
  // (supported on CUDA only)
  bool FindOnDiskForComputeCapability(absl::string_view filename,
                                      absl::string_view canonical_suffix,
                                      std::string* found_filename) const;

  // Attempts to find a more specific version of the file indicated by
  // filename by looking for AMDGPU-ISA-suffixed versions.
  // (supported on ROCm only)
  bool FindOnDiskForISAVersion(absl::string_view filename,
                               absl::string_view canonical_suffix,
                               std::string* found_filename) const;

  // Host callback landing routine invoked by CUDA.
  // data: User-provided callback provided to HostCallback() above, captured
  //       as a std::function<void()>. Allocated/initialized inside
  //       HostCallback() and owned and deleted by this call.
  static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
                                   void* data);

  // Collects metadata for the specified kernel.
  port::Status GetKernelMetadata(GpuKernel* cuda_kernel,
                                 KernelMetadata* kernel_metadata);

  // Logs, at VLOG level 2, information about the kernel's occupancy and how
  // it might be improved.
  void VlogOccupancyInfo(const KernelBase& kernel,
                         const ThreadDim& thread_dims,
                         const BlockDim& block_dims);

  // (supported on CUDA only)
  port::Status LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
  // (supported on CUDA only)
  port::Status LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // (supported on ROCm only)
  port::Status LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  bool UnloadGpuBinary(const void* gpu_binary)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
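  // Sketch of how the bookkeeping below is intended to evolve (illustrative
  // pseudocode, not executed anywhere in this header; `kernel` and `binary`
  // are hypothetical values):
  //
  //   // Loading a kernel records its backing binary and takes a reference:
  //   kernel_to_gpu_binary_[kernel] = binary;
  //   gpu_binary_to_module_[binary].second++;
  //   // Unloading reverses both steps; the module itself is unloaded once
  //   // its reference count reaches zero:
  //   UnloadGpuBinary(kernel_to_gpu_binary_[kernel]);
  //   kernel_to_gpu_binary_.erase(kernel);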
  // Guards the on-disk-module mapping.
  absl::Mutex disk_modules_mu_;

  // Mapping from filename to GpuModuleHandle, if it was already retrieved.
  // Multiple GpuFunctionHandles are usually obtained from a single
  // GpuModuleHandle, so we check this mapping first before re-reading the
  // module from disk.
  std::map<std::string, GpuModuleHandle> disk_modules_
      TF_GUARDED_BY(disk_modules_mu_);

  // Guards the in-memory-module mapping.
  absl::Mutex in_memory_modules_mu_;

  std::map<const char*, GpuModuleHandle> in_memory_modules_
      TF_GUARDED_BY(in_memory_modules_mu_);

  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
  std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
      TF_GUARDED_BY(in_memory_modules_mu_);

  // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
  std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
      gpu_binary_to_module_ TF_GUARDED_BY(in_memory_modules_mu_);

  // Guards the launched kernel set.
  absl::Mutex launched_kernels_mu_;

  // Keeps track of the set of launched kernels. Currently used to suppress
  // the occupancy check on subsequent launches.
  std::set<GpuFunctionHandle> launched_kernels_
      TF_GUARDED_BY(launched_kernels_mu_);

  // Handle for the CUDA device being operated on. Immutable
  // post-initialization.
  GpuDeviceHandle device_;

  // Handle for session with the library/driver. Immutable post-initialization.
  GpuContext* context_;

  // The device ordinal value that this executor was initialized with; recorded
  // for use in getting device metadata. Immutable post-initialization.
  int device_ordinal_;

  // The major version of the compute capability for device_.
  int cc_major_;

  // The minor version of the compute capability for device_.
  int cc_minor_;

  // GPU ISA version for device_.
  int version_;

  // The plugin configuration associated with this instance.
  PluginConfig plugin_config_;

  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
};

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_