/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
#define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_

#include <memory>
#include <string>
#include <vector>

#include "absl/base/macros.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/platform/logging.h"

namespace Eigen {
struct ThreadPoolDevice;
#ifdef TENSORFLOW_USE_SYCL
struct SyclDevice;
#endif
}  // end namespace Eigen

namespace stream_executor {
class Stream;
}  // namespace stream_executor

namespace tensorflow {

class Device;
class DeviceAttributes;
class Env;
class EventMgr;
class OpKernelContext;
class ResourceMgr;
class ScopedAllocatorMgr;
class TensorProto;

namespace thread {
class ThreadPool;
}

// A wrapper for an Eigen Gpu Device that includes per-op state. The
// class is defined even for non-GPU devices since the
// OpKernelContext::Params structure wants to fill it in.
class PerOpGpuDevice {
 public:
  virtual ~PerOpGpuDevice() {}
  virtual const Eigen::GpuDevice& device() const = 0;
};

// A class that devices can subclass to pass around
// Device-specific context to OpKernels.
class DeviceContext : public core::RefCounted {
 public:
  ~DeviceContext() override {}
  virtual stream_executor::Stream* stream() const { return nullptr; }
  virtual void MaintainLifetimeOnStream(const Tensor* t,
                                        stream_executor::Stream* stream) const {
  }

  // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
  // "device_tensor", which is on a GPU device "device". "device_tensor"
  // must be allocated to be of the same size as "cpu_tensor".
  virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                                     Tensor* device_tensor,
                                     StatusCallback done) const {
    done(errors::Internal("Unrecognized device type in CPU-to-device Copy"));
  }

  // Copies a tensor within this device.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Device* device, Tensor* output_tensor,
                                      StatusCallback done) const {
    done(errors::Unimplemented("Copy in same device not implemented."));
  }

  // "device_tensor" is a tensor on a non-CPU device. Copies
  // "device_tensor" into "cpu_tensor". "cpu_tensor" must be allocated
  // to be of the same size as "device_tensor".
  virtual void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                                     StringPiece tensor_name, Device* device,
                                     Tensor* cpu_tensor, StatusCallback done) {
    done(errors::Internal("Unrecognized device type in device-to-CPU Copy"));
  }

  // If possible, wait for all events on *stream to complete, then execute
  // func. A non-OK Status is returned otherwise. The stream argument should
  // be the one provided by GpuDeviceInfo. This function is not applicable to
  // devices that don't provide such a value.
  virtual Status ThenExecute(Device* device, stream_executor::Stream* stream,
                             std::function<void()> func) {
    return errors::Internal("ThenExecute not supported by device");
  }
};

// map[i] is the DeviceContext* for the node with id i, if i < map.size().
typedef std::vector<DeviceContext*> DeviceContextMap;
class DeviceBase {
 public:
  explicit DeviceBase(Env* env) : env_(env) {}
  virtual ~DeviceBase();

  Env* env() const { return env_; }

  // Override this to return true for devices that require an Op's
  // compute method to save references to the temporary tensors it
  // allocates until the Op execution completes.
  virtual bool RequiresRecordingAccessedTensors() const { return false; }

  struct CpuWorkerThreads {
    int num_threads = 0;
    thread::ThreadPool* workers = nullptr;
  };

  // Does not take ownership.
  void set_tensorflow_cpu_worker_threads(CpuWorkerThreads* t) {
    cpu_worker_threads_ = t;
  }

  virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
    CHECK(cpu_worker_threads_ != nullptr);
    return cpu_worker_threads_;
  }

  // "stream" is used in special circumstances (such as the
  // constructors of Ops) where there is no available OpKernelContext.
  // "default_context" is used by OpKernelContext whenever a device does not
  // supply a DeviceContext for an op in FillContextMap (e.g. when only
  // using a single stream).
  // "event_mgr" is used to delay deallocation of temporary GPU buffers.
  // TODO(pbar) Work out how to move this out of DeviceBase.
  // The GpuDeviceInfo name is an unfortunate legacy; it is used not only by
  // GPUs but also by TPU devices (to provide a default device context).
  struct GpuDeviceInfo {
    // Make sure all the defaults are NULL, so we can spot missing assignments.
    stream_executor::Stream* stream = nullptr;
    DeviceContext* default_context = nullptr;
    EventMgr* event_mgr = nullptr;
    int gpu_id = -1;
  };

  // Does not take ownership.
  void set_tensorflow_gpu_device_info(GpuDeviceInfo* g) {
    gpu_device_info_ = g;
  }

  virtual const GpuDeviceInfo* tensorflow_gpu_device_info() const {
    return gpu_device_info_;
  }

  // The preferred thread pool for this device. If it is nullptr, the system
  // automatically assigns a thread pool for execution.
  virtual thread::ThreadPool* tensorflow_device_thread_pool() {
    return device_thread_pool_;
  }

  // Does not take ownership.
  void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d);

#ifdef TENSORFLOW_USE_SYCL
  void set_eigen_sycl_device(Eigen::SyclDevice* d) { eigen_sycl_device_ = d; }
#endif

  // Return the Allocator implementation to use based on the allocator
  // attributes requested. See allocator.h for more details.
  virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
    LOG(FATAL) << "GetAllocator() is not implemented.";
    return nullptr;
  }
  // This method is provided for backwards compatibility, and will be removed
  // in a future release.
  ABSL_DEPRECATED("Use `this->GetAllocator()` or `this->GetScopedAllocator()`.")
  Allocator* GetStepAllocator(AllocatorAttributes attr, ResourceMgr*) {
    return GetAllocator(attr);
  }

  // Return an Allocator prepared for use in particular places by graph
  // optimization.
  virtual Allocator* GetScopedAllocator(AllocatorAttributes attr,
                                        int64 step_id) {
    LOG(FATAL) << "Device does not implement GetScopedAllocator()";
    return nullptr;
  }

  virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }

  bool has_eigen_cpu_device() const { return !eigen_cpu_devices_.empty(); }

  virtual const Eigen::ThreadPoolDevice* eigen_cpu_device();

#ifdef TENSORFLOW_USE_SYCL
  virtual const Eigen::SyclDevice* eigen_sycl_device() const {
    CHECK(eigen_sycl_device_ != nullptr);
    return eigen_sycl_device_;
  }
#endif

  // Caller owns the return value. The OpKernelContext calls this even
  // for devices that do not implement an eigen_gpu_device. Overridden
  // by GPU devices to return a derived type.
  virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; }

  virtual DeviceBase* UnderlyingDevice() { return this; }
  virtual const DeviceBase* UnderlyingDevice() const { return this; }

  // This is overridden by GPU devices to reinitialize the derived
  // type returned by MakeGpuDevice.
  virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
                                       PerOpGpuDevice* /*device*/,
                                       DeviceContext* /*dc*/,
                                       Allocator* /*allocator*/) {
    return Status::OK();
  }

  // Unimplemented by default.
  virtual const DeviceAttributes& attributes() const;
  virtual const string& name() const;

  // Materializes the given TensorProto into 'tensor' stored in Device
  // memory. Most devices will want to override this.
  //
  // TODO(vrv): We should be able to put this function into
  // OpKernelContext and handle the copies from device memory via send
  // and receive nodes, instead of requiring that each device handle
  // the copies here as well as in copy ops.
  virtual Status MakeTensorFromProto(const TensorProto& tensor_proto,
                                     const AllocatorAttributes alloc_attrs,
                                     Tensor* tensor) {
    return errors::Internal("Device does not implement MakeTensorFromProto()");
  }

  // Some devices (e.g. GPUs) may free device memory prior to its actual use
  // being completed, on the assumption that subsequent allocations can only be
  // used serially with respect to pending uses. If this function returns a
  // non-zero value, it is the value of a device-specific counter such that any
  // device memory tagged with an earlier freed-at count is really unencumbered
  // by pending uses. For this to be useful, the device memory allocator must
  // be tagging deallocated memory chunks using the same counter.
  virtual uint64 SafeAllocFrontier() { return 0; }

 protected:
  // Does not take ownership.
  void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
    device_thread_pool_ = thread_pool;
  }

 private:
  Env* const env_;
  CpuWorkerThreads* cpu_worker_threads_ = nullptr;
  // Set by GPUs as well as by TPU devices.
  GpuDeviceInfo* gpu_device_info_ = nullptr;
  thread::ThreadPool* device_thread_pool_ = nullptr;
  std::vector<Eigen::ThreadPoolDevice*> eigen_cpu_devices_;
#ifdef TENSORFLOW_USE_SYCL
  Eigen::SyclDevice* eigen_sycl_device_ = nullptr;
#endif
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_