/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
#error This file must only be included when building with Cuda or ROCm support
#endif

#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session_options.h"

namespace tensorflow {
class GPUKernelTracker;

class BaseGPUDevice : public LocalDevice {
 public:
  BaseGPUDevice(const SessionOptions& options, const string& name,
                Bytes memory_limit, const DeviceLocality& locality,
                TfGpuId tf_gpu_id, const string& physical_device_desc,
                Allocator* gpu_allocator, Allocator* cpu_allocator,
                bool sync_every_op, int32 max_streams);

  ~BaseGPUDevice() override;

  // Initialize the device and return the status of initialization.
  Status Init(const SessionOptions& options);

  // GPU devices require the Op Compute method to save a reference to
  // any temporary tensors that are allocated until the Op execution
  // completes.
  bool RequiresRecordingAccessedTensors() const override;

  // GPU kernel execution requires us to use `tracing::ScopedAnnotation()`
  // rather than `tracing::ScopedActivity()`, in order to relate asynchronously
  // launched GPU kernels to the OpKernel.
  bool TraceUsingAnnotations() const { return true; }

  void ConsumeListOfAccessedTensors(
      DeviceContext* device_context,
      const TensorReferenceVector& tensor_refs) override;

  Status FillContextMap(const Graph* graph,
                        DeviceContextMap* device_context_map) override;

  void Compute(OpKernel* op_kernel, OpKernelContext* context) override;

  Status Sync() override;

  void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                    AsyncOpKernel::DoneCallback done) override;

  Status MakeTensorFromProto(const TensorProto& tensor_proto,
                             const AllocatorAttributes alloc_attrs,
                             Tensor* tensor) override;

  // The caller owns the returned device.
  PerOpGpuDevice* MakeGpuDevice() override;

  Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
                               DeviceContext* dc,
                               Allocator* allocator) override;

  // Returns the platform GPU id of this device within the native driver
  // system; e.g., for CUDA and ROCm this is the ordinal of the GPU within
  // the system.
  int gpu_id() const {
    PlatformGpuId platform_gpu_id;
    TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
    return platform_gpu_id.value();
  }
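
  // Illustrative sketch (hypothetical values only): if the process was
  // started with GPUOptions::visible_device_list = "2,0", the device
  // registered as "/device:GPU:0" has TfGpuId 0, while gpu_id() returns the
  // platform (CUDA/ROCm) ordinal 2:
  //
  //   BaseGPUDevice* d = ...;    // the device behind "/device:GPU:0"
  //   CHECK_EQ(d->gpu_id(), 2);  // platform ordinal, not the TF id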

  // The executor that provides control for the device; e.g., for CUDA this
  // corresponds to the cuda context.
  se::StreamExecutor* executor() const { return executor_; }

  Allocator* GetScopedAllocator(AllocatorAttributes attr,
                                int64 step_id) override;

  ScopedAllocatorMgr* GetScopedAllocatorMgr() const override {
    return scoped_allocator_mgr_.get();
  }

  // The following two functions always return 0 unless one of the
  // related experimental config options has been specified.

  // If the returned value is > 0, then GPU memory chunks freed before this
  // count are guaranteed not to be in use by any kernel pending on this
  // device.
  uint64 SafeAllocFrontier() override;

  // Returns the number of kernels that have been queued for execution on
  // the compute stream and are not yet known to have completed.
  int PendingKernels();
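
  // Illustrative sketch (not the actual allocator implementation): a
  // timestamped allocator could stamp each freed chunk with the current
  // timing count and defer reuse until the frontier has passed it, e.g.
  //
  //   chunk->freed_at_count = current_timing_count;   // hypothetical field
  //   ...
  //   if (chunk->freed_at_count < device->SafeAllocFrontier()) {
  //     // No kernel still pending on this device can be touching the chunk,
  //     // so it is safe to hand it out again.
  //     Reuse(chunk);                                  // hypothetical helper
  //   }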

 protected:
  Allocator* gpu_allocator_;  // not owned
  Allocator* cpu_allocator_;  // not owned

  se::StreamExecutor* executor_;  // not owned
  std::unique_ptr<ScopedAllocatorMgr> scoped_allocator_mgr_;

 private:
  struct StreamGroup {
    se::Stream* compute = nullptr;
    se::Stream* host_to_device = nullptr;
    se::Stream* device_to_host = nullptr;
    gtl::InlinedVector<se::Stream*, 4> device_to_device;
  };
  class StreamGroupFactory;

  gtl::InlinedVector<StreamGroup*, 4> streams_;
  mutex scratch_init_mutex_;
  gtl::InlinedVector<char*, 4> scratch_;
  std::vector<GPUDeviceContext*> device_contexts_;
  GpuDeviceInfo* gpu_device_info_ = nullptr;
  mutex trace_mu_;
  TfGpuId tf_gpu_id_;
  const bool sync_every_op_ = false;
  const int32 max_streams_;
  std::unique_ptr<EventMgr> em_;
  std::unique_ptr<thread::ThreadPool> thread_pool_;
  std::unique_ptr<GPUKernelTracker> kernel_tracker_;
  int pending_cap_ = 0;
  bool timestamped_allocator_ = false;

  // Initialize scratch buffers used by Eigen.
  Status InitScratchBuffers();

  void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
                          int stream_id, Allocator* allocator);

  void ComputeHelper(OpKernel* op_kernel, OpKernelContext* context);

  string ComputeOpKernelDebugString(const OpKernel& op_kernel,
                                    const int& stream_id);

  // This method returns an initialization status, in addition to
  // calling the "done" StatusCallback, if there is a failure to
  // allocate memory or if the tensor "from" is not DMA-copyable.
  // If there is no error prior to enqueueing the copy, an OK status
  // is returned.
  Status MaybeCopyTensorToGPU(const AllocatorAttributes& alloc_attrs,
                              const Tensor& from, Tensor* to,
                              StatusCallback done);
};

// A per-compute-stream utility that keeps track of kernels that have been
// queued for execution but may not yet have terminated, and also the queued
// time of the most recently terminated kernel.
class GPUKernelTracker {
 public:
  // If we're going to share a SharedCounter with an allocator, it's owned
  // by the allocator because allocators are initialized once per process.
  // Devices are per-session.
  explicit GPUKernelTracker(Env* env, SharedCounter* timing_counter)
      : env_(env), timing_counter_(timing_counter), pending_kernels_(64) {
    if (!timing_counter_) {
      // There's not a preexisting counter owned by GPUProcessState, i.e.
      // pending_cap > 0 but timestamped_allocator == false.
      owned_counter_.reset(new SharedCounter);
      timing_counter_ = owned_counter_.get();
    }
  }

  // Records that a GPU kernel has just been enqueued on the compute stream.
  // Inserts a new timing counter value in a new PendingKernel record appended
  // to the end of the ring buffer, then returns that same count.
  uint64 RecordQueued();

  // Takes a count value returned by RecordQueued and finds the corresponding
  // PendingKernel record in the ring buffer. Marks the kernel as completed and
  // advances the completion frontier accordingly.
  void RecordTerminated(uint64 at_count);

  // Returns the largest timing count such that all kernels queued no
  // later than that count are known to have terminated.
  uint64 LastTerminatedCount();

  // Returns the number of kernels enqueued that are not yet known to
  // have terminated.
  int NumPending() {
    mutex_lock l(mu_);
    return num_pending_;
  }

  // Yields the current thread until the number of pending kernels no longer
  // exceeds the cap.
  void PauseWhilePendingExceeds(int cap) {
    mutex_lock l(mu_);
    while (num_pending_ > cap) {
      pending_decreased_.wait(l);
    }
  }
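
  // Illustrative usage sketch (assuming an EventMgr `em` and a compute
  // stream `stream`; not the exact call sites used by BaseGPUDevice):
  //
  //   tracker->PauseWhilePendingExceeds(pending_cap);   // throttle launches
  //   uint64 queued_count = tracker->RecordQueued();
  //   LaunchKernelOn(stream, ...);                      // hypothetical launch
  //   em->ThenExecute(stream, [tracker, queued_count]() {
  //     tracker->RecordTerminated(queued_count);        // runs after kernel
  //   });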

 private:
  Env* env_;
  SharedCounter* timing_counter_;
  std::unique_ptr<SharedCounter> owned_counter_;

  // Records when a kernel was queued for execution. Kernel launches are
  // identified by a unique count value from a per-GPU device timing counter.
  struct PendingKernel {
    uint64 queued_count;
    bool terminated;
    PendingKernel(const PendingKernel& pk)
        : queued_count(pk.queued_count), terminated(pk.terminated) {}
    PendingKernel() : queued_count(0), terminated(false) {}
  };
  mutex mu_;
  // Ring buffer of PendingKernel records.
  std::vector<PendingKernel> pending_kernels_ GUARDED_BY(mu_);
  // Next unused slot in pending_kernels_.
  int first_available_ GUARDED_BY(mu_) = 0;
  // Last completed PendingKernel such that all prior PendingKernels are
  // also completed. With out-of-order completion there may be a mixture
  // of completed and uncompleted entries between last_completed_ and
  // first_available_, hence num_pending_ is not guaranteed equal to
  // their difference.
  int last_completed_ GUARDED_BY(mu_) = -1;
  int num_pending_ GUARDED_BY(mu_) = 0;
  condition_variable pending_decreased_ GUARDED_BY(mu_);
};

class BaseGPUDeviceFactory : public DeviceFactory {
 public:
  Status CreateDevices(const SessionOptions& options,
                       const string& name_prefix,
                       std::vector<std::unique_ptr<Device>>* devices) override;

  struct InterconnectMap {
    // Name of interconnect technology, if known.
    string name;
    // If possible, strength should approximate Gb/sec bandwidth rate.
    // Where architecture-specific subclassing is not done that won't
    // always be possible. The minimum expectation is that
    // faster links should have a higher value than slower links.
    int32 strength;
    static const int kSameDeviceStrength;
    static const int kStreamExecutorStrength;
    std::set<std::pair<PlatformGpuId, PlatformGpuId>> directed_links;
  };

 protected:
  // Populates *maps with interconnect maps for all local direct access
  // pathways between GPUs.
  virtual Status GetInterconnectMaps(
      const std::vector<PlatformGpuId>& visible_gpu_order,
      se::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
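
  // Illustrative sketch (hypothetical values, not real topology data): an
  // architecture-specific override might append one entry per interconnect
  // technology it detects, e.g.
  //
  //   InterconnectMap link;
  //   link.name = "NVLink";        // assumed technology name
  //   link.strength = 25;          // rough, made-up Gb/sec estimate
  //   link.directed_links.insert({PlatformGpuId(0), PlatformGpuId(1)});
  //   link.directed_links.insert({PlatformGpuId(1), PlatformGpuId(0)});
  //   maps->push_back(link);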

  struct TfGpuIdHash {
    std::size_t operator()(const TfGpuId& id) const noexcept {
      return std::hash<int>{}(id.value());
    }
  };
  typedef std::unordered_map<TfGpuId, DeviceLocality, TfGpuIdHash> LocalityMap;
  // Populates *localities with the DeviceLocality descriptor for
  // every TfGpuId.
  virtual Status GetDeviceLocalities(
      int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
      LocalityMap* localities);

 private:
  // Creates a BaseGPUDevice associated with 'tf_gpu_id', allocates (strictly)
  // 'memory_limit' bytes of GPU memory to it, and adds it to the 'devices'
  // vector.
  Status CreateGPUDevice(const SessionOptions& options,
                         const string& name_prefix, TfGpuId tf_gpu_id,
                         int64 memory_limit, const DeviceLocality& dev_locality,
                         std::vector<std::unique_ptr<Device>>* devices);

  virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
      const SessionOptions& options, const string& name, Bytes memory_limit,
      const DeviceLocality& dev_locality, TfGpuId tf_gpu_id,
      const string& physical_device_desc, Allocator* gpu_allocator,
      Allocator* cpu_allocator) = 0;

  // Returns into 'ids' the list of valid platform GPU ids, in the order that
  // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc.,
  // based upon 'visible_gpu_order', which was generated by parsing
  // GPUOptions::visible_device_list (a comma-separated list of CUDA or ROCm
  // GPU ids).
  Status GetValidDeviceIds(const std::vector<PlatformGpuId>& visible_gpu_order,
                           std::vector<PlatformGpuId>* ids);

  // visible_gpu_initialized_[platform_gpu_id] is true if visible GPU
  // platform_gpu_id has been initialized by the process.
  std::unordered_map<int, bool> visible_gpu_initialized_;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_