/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
#define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_

#include <memory>
#include <string>
#include <vector>

#include "absl/base/macros.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/framework/device_attributes.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/platform/logging.h"

namespace Eigen {
struct ThreadPoolDevice;
}  // end namespace Eigen

namespace stream_executor {
class Stream;
}  // namespace stream_executor

namespace tensorflow {

class Device;
class DeviceAttributes;
class Env;
class EventMgr;
class OpKernelContext;
class ResourceMgr;
class ScopedAllocatorMgr;
class TensorProto;

namespace thread {
class ThreadPool;
}

// A wrapper for an Eigen Gpu Device that includes per-op state. The
// class is defined even for non-GPU devices since the
// OpKernelContext::Params structure wants to fill it in.
class PerOpGpuDevice {
 public:
  virtual ~PerOpGpuDevice() {}
  virtual const Eigen::GpuDevice& device() const = 0;
};

// A class that devices can subclass to pass around
// Device-specific context to OpKernels.
class DeviceContext : public core::RefCounted {
 public:
  ~DeviceContext() override {}
  virtual stream_executor::Stream* stream() const { return nullptr; }
  virtual void MaintainLifetimeOnStream(const Tensor* t,
                                        stream_executor::Stream* stream) const {
  }

  // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
  // "device_tensor" which is on a non-CPU device "device". "device_tensor"
  // must be allocated to be of the same size as "cpu_tensor".
  virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                                     Tensor* device_tensor, StatusCallback done,
                                     bool sync_dst_compute = true) const {
    done(errors::Internal("Unrecognized device type in CPU-to-device Copy"));
  }

  // Same as CopyCPUTensorToDevice, but in a synchronous way.
  Status CopyCPUTensorToDeviceSync(const Tensor* cpu_tensor, Device* device,
                                   Tensor* device_tensor) const;

  // Copies a tensor in this device.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Device* device, Tensor* output_tensor,
                                      StatusCallback done) const {
    done(errors::Unimplemented("Copy in same device not implemented."));
  }

  // "device_tensor" is a tensor on a non-CPU device. Copies
  // device_tensor into "cpu_tensor". "cpu_tensor" must be allocated
  // to be of the same size as "device_tensor".
  virtual void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                                     StringPiece tensor_name, Device* device,
                                     Tensor* cpu_tensor, StatusCallback done) {
    done(errors::Internal("Unrecognized device type in device-to-CPU Copy"));
  }

  // Same as `CopyDeviceTensorToCPU`, but blocks until the copy is done.
  Status CopyDeviceTensorToCPUSync(const Tensor* device_tensor,
                                   StringPiece tensor_name, Device* device,
                                   Tensor* cpu_tensor);

  // If possible, wait for all events on *stream to complete then execute func.
  // A non-OK Status is returned otherwise. The stream argument should be the
  // one provided by GpuDeviceInfo. This function is not applicable to devices
  // that don't provide such a value.
  virtual Status ThenExecute(Device* device, stream_executor::Stream* stream,
                             std::function<void()> func) {
    return errors::Internal("ThenExecute not supported by device");
  }

  // Checks if the device is a pluggable device.
  virtual bool IsPluggableDevice() { return false; }
};
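
// Illustrative sketch, not part of the API: given a DeviceContext* ctx, a
// Device* device, and pre-allocated tensors of matching size (the device
// tensor allocated from the device's allocator), a synchronous host <-> device
// round trip could look like the following. All names here are hypothetical.
//
//   TF_RETURN_IF_ERROR(
//       ctx->CopyCPUTensorToDeviceSync(&cpu_in, device, &device_tmp));
//   TF_RETURN_IF_ERROR(ctx->CopyDeviceTensorToCPUSync(&device_tmp, "tmp",
//                                                     device, &cpu_out));
//
// The asynchronous variants accept a StatusCallback `done` instead of
// returning a Status and may invoke it from a device-specific thread.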

class DeviceBase {
 public:
  explicit DeviceBase(Env* env) : env_(env) {}
  virtual ~DeviceBase();

  Env* env() const { return env_; }

  struct CpuWorkerThreads {
    int num_threads = 0;
    thread::ThreadPool* workers = nullptr;
  };

  // Does not take ownership.
  void set_tensorflow_cpu_worker_threads(CpuWorkerThreads* t) {
    cpu_worker_threads_ = t;
  }

  virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
    CHECK(cpu_worker_threads_ != nullptr);
    return cpu_worker_threads_;
  }

  // "stream" is used in special circumstances (such as the
  // constructors of Ops) where there is no available OpKernelContext.
  // "default_context" is used by OpKernelContext whenever a device does not
  // supply a DeviceContext for an op in TryGetDeviceContext() (e.g. when only
  // using a single stream.)
  // "event_mgr" is used to delay deallocation of temporary GPU buffers.
  // TODO(pbar) Work out how to move this out of DeviceBase.
  // GpuDeviceInfo name is an unfortunate legacy, it is used not only by GPUs
  // but also by TPU devices (to provide default device context).
  struct GpuDeviceInfo {
    // Make sure all the defaults are NULL, so we can spot missing assignments.
    stream_executor::Stream* stream = nullptr;
    DeviceContext* default_context = nullptr;
    EventMgr* event_mgr = nullptr;
    int gpu_id = -1;
  };

  // Does not take ownership.
  void set_tensorflow_gpu_device_info(GpuDeviceInfo* g) {
    gpu_device_info_ = g;
  }

  virtual const GpuDeviceInfo* tensorflow_gpu_device_info() const {
    return gpu_device_info_;
  }
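
  // Illustrative sketch, not part of the API: a stream-based device typically
  // fills in a GpuDeviceInfo during initialization and registers it via the
  // setter above. The names below are hypothetical; DeviceBase does not take
  // ownership of the struct.
  //
  //   auto* info = new GpuDeviceInfo;
  //   info->stream = compute_stream;           // stream_executor::Stream*
  //   info->default_context = device_context;  // DeviceContext*
  //   info->event_mgr = event_mgr;             // EventMgr*
  //   info->gpu_id = ordinal;
  //   set_tensorflow_gpu_device_info(info);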

  // The preferred thread pool for this device. If it is nullptr, the system
  // automatically assigns a thread pool for execution.
  virtual thread::ThreadPool* tensorflow_device_thread_pool() {
    return device_thread_pool_;
  }

  // Does not take ownership.
  void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d);

  // Return the Allocator implementation to use based on the allocator
  // attributes requested. See allocator.h for more details.
  virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
    LOG(FATAL) << "GetAllocator() is not implemented.";
    return nullptr;
  }

  // This method is provided for backwards compatibility, and will be removed
  // in a future release.
  ABSL_DEPRECATED("Use `this->GetAllocator()` or `this->GetScopedAllocator()`.")
  Allocator* GetStepAllocator(AllocatorAttributes attr, ResourceMgr*) {
    return GetAllocator(attr);
  }

  // Return an Allocator prepared for use in particular places by graph
  // optimization.
  virtual Allocator* GetScopedAllocator(AllocatorAttributes attr,
                                        int64 step_id) {
    LOG(FATAL) << "Device does not implement GetScopedAllocator()";
    return nullptr;
  }

  virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }

  virtual bool has_eigen_cpu_device() const {
    return !eigen_cpu_devices_.empty();
  }

  virtual const Eigen::ThreadPoolDevice* eigen_cpu_device();

  // Caller owns the return value. The OpKernelContext calls this even
  // for devices that do not implement an eigen_gpu_device. Overridden
  // by GPU devices to return a derived type.
  virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; }

  virtual DeviceBase* UnderlyingDevice() { return this; }
  virtual const DeviceBase* UnderlyingDevice() const { return this; }

  // This is overridden by GPU devices to reinitialize the derived
  // type returned by MakeGpuDevice.
  virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
                                       PerOpGpuDevice* /*device*/,
                                       DeviceContext* /*dc*/,
                                       Allocator* /*allocator*/) {
    return Status::OK();
  }

  // Unimplemented by default.
  virtual const DeviceAttributes& attributes() const;
  virtual int NumaNode() const { return attributes().locality().numa_node(); }
  virtual const std::string& name() const;

  // Materializes the given TensorProto into 'tensor' stored in Device
  // memory. Most devices will want to override this.
  //
  // TODO(vrv): We should be able to put this function into
  // OpKernelContext and handle the copies from device memory via send
  // and receive nodes, instead of requiring that each device handle
  // the copies here as well as in copy ops.
  virtual Status MakeTensorFromProto(const TensorProto& tensor_proto,
                                     const AllocatorAttributes alloc_attrs,
                                     Tensor* tensor) {
    return errors::Internal("Device does not implement MakeTensorFromProto()");
  }

  // Some devices (i.e. GPUs) may free device memory prior to its actual use
  // being completed on the assumption that subsequent allocations can only be
  // used serially with respect to pending uses. If this function returns a
  // non-zero value it is the value of a device-specific counter such that any
  // device memory tagged with an earlier freed-at count is really unencumbered
  // by pending uses. For this to be useful the device memory allocator must
  // be tagging deallocated memory chunks using the same counter.
  virtual uint64 SafeAllocFrontier(uint64 old_value) { return 0; }

  // Copies `input_tensor` to `output_tensor`, where both tensors are on this
  // device. This function assumes that `output_tensor` has already been
  // allocated with a buffer that is large enough to hold `input_tensor`'s
  // data. Calls `done` from a device-specific thread after the copy is
  // finished, which may be the same as the calling thread.
  //
  // NOTE(ayushd): This function is for TensorFlow internal use only. Deep copy
  // is discouraged and should not be used in OpKernels.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Tensor* output_tensor,
                                      const DeviceContext* device_context,
                                      StatusCallback done) {
    done(errors::Internal("Device ", name(), " does not implement ",
                          "CopyTensorInSameDevice"));
  }

 protected:
  // Does not take ownership.
  void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
    device_thread_pool_ = thread_pool;
  }

 private:
  Env* const env_;
  CpuWorkerThreads* cpu_worker_threads_ = nullptr;
  // Set by GPUs as well as by TPU devices.
  GpuDeviceInfo* gpu_device_info_ = nullptr;
  thread::ThreadPool* device_thread_pool_ = nullptr;
  std::vector<Eigen::ThreadPoolDevice*> eigen_cpu_devices_;
};
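
// Illustrative sketch, not part of the API: a minimal CPU-like device mainly
// needs to supply an Allocator and materialize TensorProtos. The class and
// member names below are hypothetical; error handling is abbreviated.
//
//   class MyDevice : public DeviceBase {
//    public:
//     explicit MyDevice(Env* env) : DeviceBase(env) {}
//
//     Allocator* GetAllocator(AllocatorAttributes /*attr*/) override {
//       return cpu_allocator();
//     }
//
//     Status MakeTensorFromProto(const TensorProto& tensor_proto,
//                                const AllocatorAttributes alloc_attrs,
//                                Tensor* tensor) override {
//       Tensor parsed(tensor_proto.dtype());
//       if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
//         return errors::InvalidArgument("Cannot parse tensor from proto");
//       }
//       *tensor = std::move(parsed);
//       return Status::OK();
//     }
//   };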

// Methods to create and check for Symbolic execution devices.
// Such devices are mostly used by the TF-XLA bridge. TF should not treat these
// as normal devices.
void AddSymbolicExecutionDevice(absl::string_view device_name);
bool IsSymbolicExecutionDevice(absl::string_view device_name);

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_