/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
#define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_

#include <memory>
#include <string>
#include <vector>

#include "absl/base/macros.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/framework/device_attributes.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/device_name_utils.h"

namespace Eigen {
struct ThreadPoolDevice;
}  // end namespace Eigen

namespace stream_executor {
class Stream;
}  // namespace stream_executor

namespace tensorflow {

class Device;
class DeviceAttributes;
class Env;
class EventMgr;
class OpKernelContext;
class ResourceMgr;
class ScopedAllocatorMgr;
class TensorProto;

namespace thread {
class ThreadPool;
}

// A wrapper for an Eigen Gpu Device that includes per-op state. The
// class is defined even for non-GPU devices since the
// OpKernelContext::Params structure wants to fill it in.
class PerOpGpuDevice {
 public:
  virtual ~PerOpGpuDevice() {}
  virtual const Eigen::GpuDevice& device() const = 0;
};

// A class that devices can subclass to pass around
// Device-specific context to OpKernels.
class DeviceContext : public core::RefCounted {
 public:
  ~DeviceContext() override {}
  virtual stream_executor::Stream* stream() const { return nullptr; }
  virtual void MaintainLifetimeOnStream(const Tensor* t,
                                        stream_executor::Stream* stream) const {
  }

  // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
  // "device_tensor" which is on a non-CPU device "device". "device_tensor"
  // must be allocated to be of the same size as "cpu_tensor".
  virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                                     Tensor* device_tensor, StatusCallback done,
                                     bool sync_dst_compute = true) const {
    done(errors::Internal("Unrecognized device type in CPU-to-device Copy"));
  }

  // Same as CopyCPUTensorToDevice, but in a synchronous way.
  Status CopyCPUTensorToDeviceSync(const Tensor* cpu_tensor, Device* device,
                                   Tensor* device_tensor) const;
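
  // Illustrative sketch only (not part of this API): a kernel that already
  // has an OpKernelContext `ctx` and a host tensor `cpu_tensor` could use the
  // synchronous helper above roughly as follows, assuming `ctx->device()` is
  // a non-CPU Device and `ctx->op_device_context()` returns a non-null
  // DeviceContext:
  //
  //   DeviceContext* dc = ctx->op_device_context();
  //   Tensor device_tensor;
  //   OP_REQUIRES_OK(ctx, ctx->allocate_temp(cpu_tensor.dtype(),
  //                                          cpu_tensor.shape(),
  //                                          &device_tensor));
  //   OP_REQUIRES_OK(ctx, dc->CopyCPUTensorToDeviceSync(
  //                           &cpu_tensor, static_cast<Device*>(ctx->device()),
  //                           &device_tensor));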

  // Copies a tensor in this device.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Device* device, Tensor* output_tensor,
                                      StatusCallback done) const {
    done(errors::Unimplemented("Copy in same device not implemented."));
  }

  // "device_tensor" is a tensor on a non-CPU device. Copies
  // device_tensor into "cpu_tensor". "cpu_tensor" must be allocated
  // to be of the same size as "device_tensor".
  virtual void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                                     StringPiece tensor_name, Device* device,
                                     Tensor* cpu_tensor, StatusCallback done) {
    done(errors::Internal("Unrecognized device type in device-to-CPU Copy"));
  }

  // Same as `CopyDeviceTensorToCPU`, but blocks until the copy is done.
  Status CopyDeviceTensorToCPUSync(const Tensor* device_tensor,
                                   StringPiece tensor_name, Device* device,
                                   Tensor* cpu_tensor);

  // If possible, wait for all events on *stream to complete, then execute
  // func. A non-OK Status is returned otherwise. The stream argument should
  // be the one provided by AcceleratorDeviceInfo. This function is not
  // applicable to devices that don't provide such a value.
  virtual Status ThenExecute(Device* device, stream_executor::Stream* stream,
                             std::function<void()> func) {
    return errors::Internal("ThenExecute not supported by device");
  }

  // Checks if the device is a pluggable device.
  virtual bool IsPluggableDevice() { return false; }

  // Returns the pinned host memory allocator for the device.
  virtual Allocator* host_memory_allocator() const { return nullptr; }
};

class DeviceBase {
 public:
  explicit DeviceBase(Env* env) : env_(env) {}
  virtual ~DeviceBase();

  Env* env() const { return env_; }

  struct CpuWorkerThreads {
    int num_threads = 0;
    thread::ThreadPool* workers = nullptr;
  };

  // Does not take ownership.
  void set_tensorflow_cpu_worker_threads(CpuWorkerThreads* t) {
    cpu_worker_threads_ = t;
  }

  virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
    CHECK(cpu_worker_threads_ != nullptr);
    return cpu_worker_threads_;
  }

  // "stream" is used in special circumstances (such as the
  // constructors of Ops) where there is no available OpKernelContext.
  // "default_context" is used by OpKernelContext whenever a device does not
  // supply a DeviceContext for an op in TryGetDeviceContext() (e.g. when only
  // using a single stream.)
  // "event_mgr" is used to delay deallocation of temporary GPU buffers.
  // TODO(pbar) Work out how to move this out of DeviceBase.
  struct AcceleratorDeviceInfo {
    // Make sure all the defaults are NULL, so we can spot missing assignments.
    stream_executor::Stream* stream = nullptr;
    DeviceContext* default_context = nullptr;
    EventMgr* event_mgr = nullptr;
    int gpu_id = -1;
  };
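
  // Illustrative sketch only, not prescribed by this header: an accelerator
  // device implementation typically fills in an AcceleratorDeviceInfo during
  // initialization and registers it with the setter below. The setter does
  // not take ownership, so the device must keep `info` alive for its own
  // lifetime. `MyDeviceContext` and the local variables are hypothetical:
  //
  //   auto* info = new AcceleratorDeviceInfo;
  //   info->stream = compute_stream;             // stream_executor::Stream*
  //   info->default_context = new MyDeviceContext(compute_stream);
  //   info->event_mgr = event_mgr;               // shared EventMgr
  //   info->gpu_id = device_ordinal;
  //   set_tensorflow_accelerator_device_info(info);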

  // Does not take ownership.
  void set_tensorflow_accelerator_device_info(
      AcceleratorDeviceInfo* device_info) {
    accelerator_device_info_ = device_info;
  }

  virtual const AcceleratorDeviceInfo* tensorflow_accelerator_device_info()
      const {
    return accelerator_device_info_;
  }

  // The preferred thread pool for this device. If it is nullptr, the system
  // automatically assigns a thread pool for execution.
  virtual thread::ThreadPool* tensorflow_device_thread_pool() {
    return device_thread_pool_;
  }

  // Does not take ownership.
  void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d);

  // Return the Allocator implementation to use based on the allocator
  // attributes requested. See allocator.h for more details.
  virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
    LOG(FATAL) << "GetAllocator() is not implemented.";
    return nullptr;
  }

  // This method is provided for backwards compatibility, and will be removed
  // in a future release.
  ABSL_DEPRECATED("Use `this->GetAllocator()` or `this->GetScopedAllocator()`.")
  Allocator* GetStepAllocator(AllocatorAttributes attr, ResourceMgr*) {
    return GetAllocator(attr);
  }

  // Return an Allocator prepared for use in particular places by graph
  // optimization.
  virtual Allocator* GetScopedAllocator(AllocatorAttributes attr,
                                        int64_t step_id) {
    LOG(FATAL) << "Device does not implement GetScopedAllocator()";
    return nullptr;
  }

  virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }

  virtual bool has_eigen_cpu_device() const {
    return !eigen_cpu_devices_.empty();
  }

  virtual const Eigen::ThreadPoolDevice* eigen_cpu_device();

  // Caller owns the return value. The OpKernelContext calls this even
  // for devices that do not implement an eigen_gpu_device. Overridden
  // by GPU devices to return a derived type.
  virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; }

  virtual DeviceBase* UnderlyingDevice() { return this; }
  virtual const DeviceBase* UnderlyingDevice() const { return this; }

  // This is overridden by GPU devices to reinitialize the derived
  // type returned by MakeGpuDevice.
  virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
                                       PerOpGpuDevice* /*device*/,
                                       DeviceContext* /*dc*/,
                                       Allocator* /*allocator*/) {
    return OkStatus();
  }

  // Unimplemented by default.
  virtual const DeviceAttributes& attributes() const;
  virtual int NumaNode() const { return attributes().locality().numa_node(); }
  virtual const std::string& name() const;
  virtual const DeviceNameUtils::ParsedName& parsed_name() const;
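
  // Illustrative sketch only (the device name below is a hypothetical
  // example): for a device named "/job:worker/replica:0/task:1/device:GPU:0",
  // the parsed name above decomposes roughly as
  //
  //   const DeviceNameUtils::ParsedName& pn = parsed_name();
  //   // pn.job == "worker", pn.replica == 0, pn.task == 1,
  //   // pn.type == "GPU",   pn.id == 0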

  // Updates `attributes()`, indicating the XLA global ID associated with this
  // device. This ID is unique across clients in a multi-client setup. For TPUs
  // this does not happen until the TPU system has been initialized.
  //
  // Implemented in Device.
  virtual void set_xla_global_id(int64_t id) {}

  // Materializes the given TensorProto into 'tensor' stored in Device
  // memory. Most devices will want to override this.
  //
  // TODO(vrv): We should be able to put this function into
  // OpKernelContext and handle the copies from device memory via send
  // and receive nodes, instead of requiring that each device handle
  // the copies here as well as in copy ops.
  virtual Status MakeTensorFromProto(const TensorProto& tensor_proto,
                                     const AllocatorAttributes alloc_attrs,
                                     Tensor* tensor) {
    return errors::Internal("Device does not implement MakeTensorFromProto()");
  }

  // Some devices (e.g. GPUs) may free device memory prior to its actual use
  // being completed, on the assumption that subsequent allocations can only be
  // used serially with respect to pending uses. If this function returns a
  // non-zero value, it is the value of a device-specific counter such that any
  // device memory tagged with an earlier freed-at count is really unencumbered
  // by pending uses. For this to be useful the device memory allocator must
  // be tagging deallocated memory chunks using the same counter.
  virtual uint64 SafeAllocFrontier(uint64 old_value) { return 0; }

  // Copies `input_tensor` to `output_tensor`, where both tensors are on this
  // device. This function assumes that `output_tensor` has already been
  // allocated with a buffer that is large enough to hold `input_tensor`'s
  // data. Calls `done` from a device-specific thread after the copy is
  // finished, which may be the same as the calling thread.
  //
  // NOTE(ayushd): This function is for TensorFlow internal use only. Deep copy
  // is discouraged and should not be used in OpKernels.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Tensor* output_tensor,
                                      const DeviceContext* device_context,
                                      StatusCallback done) {
    done(errors::Internal("Device ", name(), " does not implement ",
                          "CopyTensorInSameDevice"));
  }

 protected:
  // Does not take ownership.
  void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
    device_thread_pool_ = thread_pool;
  }

 private:
  Env* const env_;
  CpuWorkerThreads* cpu_worker_threads_ = nullptr;
  // Set by GPUs as well as by TPU devices.
  AcceleratorDeviceInfo* accelerator_device_info_ = nullptr;
  thread::ThreadPool* device_thread_pool_ = nullptr;
  std::vector<Eigen::ThreadPoolDevice*> eigen_cpu_devices_;
};

// Methods to create and check for symbolic execution devices. Such devices
// are mostly used by the TF-XLA bridge; TF should not treat them as normal
// devices.
void AddSymbolicExecutionDevice(absl::string_view device_name);
bool IsSymbolicExecutionDevice(absl::string_view device_name);

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_