/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_

#include <set>
#include <unordered_map>

#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"

namespace stream_executor {
namespace gpu {

// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class GpuExecutor : public internal::StreamExecutorInterface {
 public:
  // plugin_config specifies which plugins (BLAS, DNN, FFT, RNG) this
  // executor should use.
  explicit GpuExecutor(const PluginConfig& plugin_config)
      : device_(0),
        context_(nullptr),
        device_ordinal_(0),
        cc_major_(0),
        cc_minor_(0),
        version_(0),
        plugin_config_(plugin_config) {}

  // See the corresponding StreamExecutor methods for method comments on the
  // following overrides.
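
  // Clients normally reach these overrides through the public StreamExecutor
  // wrapper rather than invoking this class directly. A minimal sketch of
  // that path (hypothetical client code; assumes a registered CUDA platform
  // and uses `se` as an alias for the stream_executor namespace):
  //
  //   se::Platform* platform =
  //       se::MultiPlatformManager::PlatformWithName("CUDA").ValueOrDie();
  //   se::StreamExecutor* executor =
  //       platform->ExecutorForDevice(/*ordinal=*/0).ValueOrDie();
  //   se::Stream stream(executor);
  //   stream.Init();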

  ~GpuExecutor() override;

  port::Status Init(int device_ordinal, DeviceOptions device_options) override;

  bool GetKernel(const MultiKernelLoaderSpec& spec,
                 KernelBase* kernel) override;
  // (supported on CUDA only)
  void UnloadKernel(const KernelBase* kernel) override;
  bool LoadModule(const MultiModuleLoaderSpec& spec,
                  ModuleHandle* module_handle) override;
  bool UnloadModule(ModuleHandle module_handle) override;

  bool Launch(Stream* stream, const ThreadDim& thread_dims,
              const BlockDim& block_dims, const KernelBase& k,
              const KernelArgsArrayBase& args) override;

  // (supported on CUDA only)
  int CalculateOccupancy(const DeviceDescription& device_description,
                         uint64 registers_per_thread,
                         uint64 shared_memory_per_block,
                         const ThreadDim& thread_dims, GpuFunctionHandle func);

  // (supported on CUDA only)
  int CompareOccupancy(int* initial_blocks,
                       const DeviceDescription& device_description,
                       uint64 registers_per_thread,
                       uint64 shared_memory_per_block,
                       const ThreadDim& thread_dims, GpuFunctionHandle func);

  void* Allocate(uint64 size) override;

  void* AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                          uint64 size_bytes) override;

  void Deallocate(DeviceMemoryBase* mem) override;

  void* UnifiedMemoryAllocate(uint64 size) override {
    return GpuDriver::UnifiedMemoryAllocate(context_, size);
  }

  void UnifiedMemoryDeallocate(void* location) override {
    return GpuDriver::UnifiedMemoryDeallocate(context_, location);
  }

  // CUDA allocation/registration functions are necessary because the driver
  // internally sets up buffers for DMA operations (and page locks them).
  // There's no external interface for us to otherwise control these DMA
  // settings.
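  //
  // For example, a page-locked staging buffer for fast host<->device DMA
  // might be managed as below (hypothetical usage sketch; `executor` is an
  // initialized GpuExecutor):
  //
  //   void* pinned = executor->HostMemoryAllocate(1 << 20);  // 1 MiB, pinned
  //   // ... stage data in `pinned` and issue Memcpy()s through it ...
  //   executor->HostMemoryDeallocate(pinned);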
  void* HostMemoryAllocate(uint64 size) override {
    return GpuDriver::HostAllocate(context_, size);
  }

  void HostMemoryDeallocate(void* location) override {
    return GpuDriver::HostDeallocate(context_, location);
  }

  bool HostMemoryRegister(void* location, uint64 size) override;

  bool HostMemoryUnregister(void* location) override;

  bool SynchronizeAllActivity() override;

  bool SynchronousMemZero(DeviceMemoryBase* location, uint64 size) override;

  bool SynchronousMemSet(DeviceMemoryBase* location, int value,
                         uint64 size) override;

  port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                 const void* host_src, uint64 size) override;

  port::Status SynchronousMemcpy(void* host_dst,
                                 const DeviceMemoryBase& gpu_src,
                                 uint64 size) override;

  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
                                               const DeviceMemoryBase& gpu_src,
                                               uint64 size) override;

  bool MemZero(Stream* stream, DeviceMemoryBase* location,
               uint64 size) override;
  bool Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
              uint64 size) override;
  bool Memset32(Stream* stream, DeviceMemoryBase* location, uint32 pattern,
                uint64 size) override;

  bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
              uint64 size) override;

  bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
              uint64 size) override;

  bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
                            const DeviceMemoryBase& gpu_src,
                            uint64 size) override;

  bool HostCallback(Stream* stream,
                    std::function<port::Status()> callback) override;

  bool AllocateStream(Stream* stream) override;

  void DeallocateStream(Stream* stream) override;

  bool CreateStreamDependency(Stream* dependent, Stream* other) override;

  bool AllocateTimer(Timer* timer) override;

  void DeallocateTimer(Timer* timer) override;

  bool StartTimer(Stream* stream, Timer* timer) override;

  bool StopTimer(Stream* stream, Timer* timer) override;

  port::Status AllocateEvent(Event* event) override;

  port::Status DeallocateEvent(Event* event) override;

  port::Status RecordEvent(Stream* stream, Event* event) override;

  port::Status WaitForEvent(Stream* stream, Event* event) override;

  Event::Status PollForEventStatus(Event* event) override;

  port::Status BlockHostUntilDone(Stream* stream) override;

  int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }

  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;
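
  // The two peer-access methods above are typically used together before
  // issuing a device-to-device copy between two executors. A minimal sketch
  // (hypothetical code; `from` and `to` are initialized executors, error
  // handling elided):
  //
  //   if (from->CanEnablePeerAccessTo(to)) {
  //     port::Status status = from->EnablePeerAccessTo(to);
  //     if (!status.ok()) LOG(ERROR) << status;
  //   }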

  SharedMemoryConfig GetDeviceSharedMemoryConfig() override;

  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;

  bool DeviceMemoryUsage(int64* free, int64* total) const override;

  // Searches for the symbol and returns a device pointer and size.
  // Returns false if the symbol does not exist.
  bool GetSymbol(const string& symbol_name, ModuleHandle module_handle,
                 void** mem, size_t* bytes) override;

  DeviceDescription* PopulateDeviceDescription() const override;

  // Populates the block_dim_limit by querying the device driver API. If an
  // error occurs at any point while asking the driver for block dim limits,
  // it will be only partially populated as a result, and an error will be
  // logged.
  bool FillBlockDimLimit(BlockDim* block_dim_limit) const;

  bool SupportsBlas() const override;

  blas::BlasSupport* CreateBlas() override;

  bool SupportsFft() const override;

  fft::FftSupport* CreateFft() override;

  bool SupportsRng() const override;

  rng::RngSupport* CreateRng() override;

  bool SupportsDnn() const override;

  dnn::DnnSupport* CreateDnn() override;

  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
      override;

  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
      override;

  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;

  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;

  void* GpuContextHack() override;

  GpuContext* gpu_context();

 private:
  // Attempts to find a more specific version of the file indicated by
  // filename by looking for compute-capability-specific suffixed versions;
  // i.e. looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is
  // present if we're on a compute capability 3.0 machine.
  // (supported on CUDA only)
  bool FindOnDiskForComputeCapability(absl::string_view filename,
                                      absl::string_view canonical_suffix,
                                      string* found_filename) const;

  // Attempts to find a more specific version of the file indicated by
  // filename by looking for AMDGPU ISA-specific suffixed versions.
  // (supported on ROCm only)
  bool FindOnDiskForISAVersion(absl::string_view filename,
                               absl::string_view canonical_suffix,
                               string* found_filename) const;

  // Host callback landing routine invoked by CUDA.
  // data: User-provided callback provided to HostCallback() above, captured
  //     as a std::function<void()>. Allocated/initialized inside
  //     HostCallback() and owned and deleted by this call.
  static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
                                   void* data);

  // Collects metadata for the specified kernel.
  bool GetKernelMetadata(GpuKernel* cuda_kernel,
                         KernelMetadata* kernel_metadata);

  // Prints to VLOG(2) information about the kernel's occupancy and how it
  // might be improved.
  void VlogOccupancyInfo(const KernelBase& kernel,
                         const ThreadDim& thread_dims,
                         const BlockDim& block_dims);

  // (supported on CUDA only)
  bool LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
  // (supported on CUDA only)
  bool LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // (supported on ROCm only)
  bool LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  bool UnloadGpuBinary(const void* gpu_binary)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
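
  // The module maps below reference-count loaded GPU binaries: loading an
  // already-present binary bumps its count instead of re-loading it, and
  // UnloadGpuBinary() only destroys the driver module once the count drops
  // to zero. A simplified sketch of the reuse path (hypothetical code, not
  // the actual implementation):
  //
  //   auto it = gpu_binary_to_module_.find(gpu_binary);
  //   if (it != gpu_binary_to_module_.end()) {
  //     ++it->second.second;         // Another kernel shares this binary.
  //     *module = it->second.first;  // Reuse the already-loaded module.
  //     return true;
  //   }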

  // Guards the on-disk-module mapping.
  mutex disk_modules_mu_;

  // Mapping from filename to GpuModuleHandle, if it was already retrieved.
  // Multiple GpuFunctionHandles are usually obtained from a single
  // GpuModuleHandle, so we attempt to hit in this mapping first, before
  // retrieving it.
  std::map<string, GpuModuleHandle> disk_modules_ GUARDED_BY(disk_modules_mu_);

  // Guards the in-memory-module mapping.
  mutex in_memory_modules_mu_;

  std::map<const char*, GpuModuleHandle> in_memory_modules_
      GUARDED_BY(in_memory_modules_mu_);

  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
  std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
      GUARDED_BY(in_memory_modules_mu_);
  // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
  std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
      gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);

  // Guards the launched kernel set.
  mutex launched_kernels_mu_;

  // Keeps track of the set of launched kernels. Currently used to suppress
  // the occupancy check on subsequent launches.
  std::set<GpuFunctionHandle> launched_kernels_
      GUARDED_BY(launched_kernels_mu_);

  // Handle for the CUDA device being operated on. Immutable
  // post-initialization.
  GpuDeviceHandle device_;

  // Handle for session with the library/driver. Immutable post-initialization.
  GpuContext* context_;

  // The device ordinal value that this executor was initialized with; recorded
  // for use in getting device metadata. Immutable post-initialization.
  int device_ordinal_;

  // The major version of the compute capability for device_.
  int cc_major_;

  // The minor version of the compute capability for device_.
  int cc_minor_;

  // GPU ISA version for device_.
  int version_;

  // The plugin configuration associated with this instance.
  PluginConfig plugin_config_;

  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
};

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_