1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_ 17 #define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_ 18 19 #include <memory> 20 #include <string> 21 #include <vector> 22 23 #include "absl/base/macros.h" 24 #include "absl/strings/string_view.h" 25 #include "tensorflow/core/framework/device_attributes.pb.h" 26 #include "tensorflow/core/framework/tensor.h" 27 #include "tensorflow/core/lib/core/errors.h" 28 #include "tensorflow/core/lib/core/refcount.h" 29 #include "tensorflow/core/lib/core/status.h" 30 #include "tensorflow/core/lib/core/stringpiece.h" 31 #include "tensorflow/core/platform/logging.h" 32 33 namespace Eigen { 34 struct ThreadPoolDevice; 35 #ifdef TENSORFLOW_USE_SYCL 36 struct SyclDevice; 37 #endif 38 } // end namespace Eigen 39 40 namespace stream_executor { 41 class Stream; 42 } // namespace stream_executor 43 44 namespace tensorflow { 45 46 class Device; 47 class DeviceAttributes; 48 class Env; 49 class EventMgr; 50 class OpKernelContext; 51 class ResourceMgr; 52 class ScopedAllocatorMgr; 53 class TensorProto; 54 55 namespace thread { 56 class ThreadPool; 57 } 58 59 // A wrapper for an Eigen Gpu Device that includes per-op state. The 60 // class is defined even for non-GPU devices since the 61 // OpKernelContext::Params structure wants to fill it in. 62 class PerOpGpuDevice { 63 public: ~PerOpGpuDevice()64 virtual ~PerOpGpuDevice() {} 65 virtual const Eigen::GpuDevice& device() const = 0; 66 }; 67 68 // A class that devices can subclass to pass around 69 // Device-specific context to OpKernels. 70 class DeviceContext : public core::RefCounted { 71 public: ~DeviceContext()72 ~DeviceContext() override {} stream()73 virtual stream_executor::Stream* stream() const { return nullptr; } MaintainLifetimeOnStream(const Tensor * t,stream_executor::Stream * stream)74 virtual void MaintainLifetimeOnStream(const Tensor* t, 75 stream_executor::Stream* stream) const { 76 } 77 78 // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into 79 // "device_tensor" which is on a non-CPU device "device". "device_tensor" 80 // must be allocated to be of the same size as "cpu_tensor". 81 virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, 82 Tensor* device_tensor, StatusCallback done, 83 bool sync_dst_compute = true) const { 84 done(errors::Internal("Unrecognized device type in CPU-to-device Copy")); 85 } 86 87 // Copies a tensor in this device. CopyTensorInSameDevice(const Tensor * input_tensor,Device * device,Tensor * output_tensor,StatusCallback done)88 virtual void CopyTensorInSameDevice(const Tensor* input_tensor, 89 Device* device, Tensor* output_tensor, 90 StatusCallback done) const { 91 done(errors::Unimplemented("Copy in same device not implemented.")); 92 } 93 94 // "device_tensor" is a tensor on a non-CPU device. Copies 95 // device_tensor into "cpu_tensor". "cpu_tensor" must be allocated 96 // to be of the same size as "device_tensor". CopyDeviceTensorToCPU(const Tensor * device_tensor,StringPiece tensor_name,Device * device,Tensor * cpu_tensor,StatusCallback done)97 virtual void CopyDeviceTensorToCPU(const Tensor* device_tensor, 98 StringPiece tensor_name, Device* device, 99 Tensor* cpu_tensor, StatusCallback done) { 100 done(errors::Internal("Unrecognized device type in device-to-CPU Copy")); 101 } 102 103 // If possible, wait for all events on *stream to complete then execute func. 104 // A non-OK Status is returned otherwise. The stream argument should be the 105 // one provided by GpuDeviceInfo. This function is not applicable to devices 106 // that don't provide such a value. ThenExecute(Device * device,stream_executor::Stream * stream,std::function<void ()> func)107 virtual Status ThenExecute(Device* device, stream_executor::Stream* stream, 108 std::function<void()> func) { 109 return errors::Internal("ThenExecute not supported by device"); 110 } 111 }; 112 113 // map[i] is the DeviceContext* for the node with id i, if i < map.size(). 114 typedef std::vector<DeviceContext*> DeviceContextMap; 115 116 class DeviceBase { 117 public: DeviceBase(Env * env)118 explicit DeviceBase(Env* env) : env_(env) {} 119 virtual ~DeviceBase(); 120 env()121 Env* env() const { return env_; } 122 123 // Override this to return true for devices that require an Op's 124 // compute method to save references to the temporary tensors it 125 // allocates until the Op execution completes RequiresRecordingAccessedTensors()126 virtual bool RequiresRecordingAccessedTensors() const { return false; } 127 128 struct CpuWorkerThreads { 129 int num_threads = 0; 130 thread::ThreadPool* workers = nullptr; 131 }; 132 133 // Does not take ownership. set_tensorflow_cpu_worker_threads(CpuWorkerThreads * t)134 void set_tensorflow_cpu_worker_threads(CpuWorkerThreads* t) { 135 cpu_worker_threads_ = t; 136 } 137 tensorflow_cpu_worker_threads()138 virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const { 139 CHECK(cpu_worker_threads_ != nullptr); 140 return cpu_worker_threads_; 141 } 142 143 // "stream" is used in special circumstances (such as the 144 // constructors of Ops) where there is no available OpKernelContext. 145 // "default_context" is used by OpKernelContext whenever a device does not 146 // supply a DeviceContext for an op in TryGetDeviceContext() (e.g. when only 147 // using a single stream.) 148 // "event_mgr" is used to delay deallocation of temporary GPU buffers. 149 // TODO(pbar) Work out how to move this out of DeviceBase. 150 // GpuDeviceInfo name is an unfortunate legacy, it is used not only by GPUs 151 // but also by TPU devices (to provide default device context). 152 struct GpuDeviceInfo { 153 // Make sure all the defaults are NULL, so we can spot missing assignments. 154 stream_executor::Stream* stream = nullptr; 155 DeviceContext* default_context = nullptr; 156 EventMgr* event_mgr = nullptr; 157 int gpu_id = -1; 158 }; 159 160 // Does not take ownership. set_tensorflow_gpu_device_info(GpuDeviceInfo * g)161 void set_tensorflow_gpu_device_info(GpuDeviceInfo* g) { 162 gpu_device_info_ = g; 163 } 164 tensorflow_gpu_device_info()165 virtual const GpuDeviceInfo* tensorflow_gpu_device_info() const { 166 return gpu_device_info_; 167 } 168 169 // The preferred thread pool for this device. If it is nullptr, the system 170 // automatically assigns a thread pool for execution. tensorflow_device_thread_pool()171 virtual thread::ThreadPool* tensorflow_device_thread_pool() { 172 return device_thread_pool_; 173 } 174 175 // Does not take ownership. 176 void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d); 177 178 #ifdef TENSORFLOW_USE_SYCL set_eigen_sycl_device(Eigen::SyclDevice * d)179 void set_eigen_sycl_device(Eigen::SyclDevice* d) { eigen_sycl_device_ = d; } 180 #endif 181 182 // Return the Allocator implementation to use based on the allocator 183 // attributes requested. See allocator.h for more details. GetAllocator(AllocatorAttributes)184 virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) { 185 LOG(FATAL) << "GetAllocator() is not implemented."; 186 return nullptr; 187 } 188 189 // This method is provided for backwards compatibility, and will be removed 190 // in a future release. 191 ABSL_DEPRECATED("Use `this->GetAllocator()` or `this->GetScopedAllocator()`.") GetStepAllocator(AllocatorAttributes attr,ResourceMgr *)192 Allocator* GetStepAllocator(AllocatorAttributes attr, ResourceMgr*) { 193 return GetAllocator(attr); 194 } 195 196 // Return an Allocator prepared for use in particular places by graph 197 // optimization GetScopedAllocator(AllocatorAttributes attr,int64 step_id)198 virtual Allocator* GetScopedAllocator(AllocatorAttributes attr, 199 int64 step_id) { 200 LOG(FATAL) << "Device does not implement GetScopedAllocator()"; 201 return nullptr; 202 } 203 GetScopedAllocatorMgr()204 virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; } 205 has_eigen_cpu_device()206 virtual bool has_eigen_cpu_device() const { 207 return !eigen_cpu_devices_.empty(); 208 } 209 210 virtual const Eigen::ThreadPoolDevice* eigen_cpu_device(); 211 212 #ifdef TENSORFLOW_USE_SYCL eigen_sycl_device()213 virtual const Eigen::SyclDevice* eigen_sycl_device() const { 214 CHECK(eigen_sycl_device_ != nullptr); 215 return eigen_sycl_device_; 216 } 217 #endif 218 219 // Caller owns the return value. The OpKernelContext calls this even 220 // for devices that do not implement an eigen_gpu_device. Overridden 221 // by GPU devices to return a derived type. MakeGpuDevice()222 virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; } 223 UnderlyingDevice()224 virtual DeviceBase* UnderlyingDevice() { return this; } UnderlyingDevice()225 virtual const DeviceBase* UnderlyingDevice() const { return this; } 226 227 // This is overridden by GPU devices to reinitialize the derived 228 // type returned by MakeGpuDevice. ReinitializeGpuDevice(OpKernelContext *,PerOpGpuDevice *,DeviceContext *,Allocator *)229 virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/, 230 PerOpGpuDevice* /*device*/, 231 DeviceContext* /*dc*/, 232 Allocator* /*allocator*/) { 233 return Status::OK(); 234 } 235 236 // Unimplemented by default 237 virtual const DeviceAttributes& attributes() const; NumaNode()238 virtual int NumaNode() const { return attributes().locality().numa_node(); } 239 virtual const string& name() const; 240 241 // Materializes the given TensorProto into 'tensor' stored in Device 242 // memory. Most devices will want to override this. 243 // 244 // TODO(vrv): We should be able to put this function into 245 // OpKernelContext and handle the copies from device memory via send 246 // and receive nodes, instead of requiring that each device handle 247 // the copies here as well as in copy ops. MakeTensorFromProto(const TensorProto & tensor_proto,const AllocatorAttributes alloc_attrs,Tensor * tensor)248 virtual Status MakeTensorFromProto(const TensorProto& tensor_proto, 249 const AllocatorAttributes alloc_attrs, 250 Tensor* tensor) { 251 return errors::Internal("Device does not implement MakeTensorFromProto()"); 252 } 253 254 // Some devices (i.e. GPUs) may free device memory prior to its actual use 255 // being completed on the assumption that subsequent allocations can only be 256 // used serially with respect to pending uses. If this function returns a 257 // non-zero value it is the value of a device-specific counter such that any 258 // device memory tagged with an earlier freed-at count is really unencumbered 259 // by pending uses. For this to be useful the device memory allocator must 260 // be tagging deallocated memory chunks using the same counter. SafeAllocFrontier(uint64 old_value)261 virtual uint64 SafeAllocFrontier(uint64 old_value) { return 0; } 262 263 // Copies `input_tensor` to `output_tensor`, where both tensors are on this 264 // device. This function assumes that `output_tensor` has already been 265 // allocated with a buffer that is large enough to hold `input_tensor`'s data. 266 // Calls `done` from a device-specific thread after copy is finished, which 267 // may be the same as calling thread. 268 // 269 // NOTE(ayushd): This function is for TensorFlow internal use only. Deep copy 270 // is discouraged and should not be used in OpKernels. CopyTensorInSameDevice(const Tensor * input_tensor,Tensor * output_tensor,const DeviceContext * device_context,StatusCallback done)271 virtual void CopyTensorInSameDevice(const Tensor* input_tensor, 272 Tensor* output_tensor, 273 const DeviceContext* device_context, 274 StatusCallback done) { 275 done(errors::Internal("Device ", name(), " does not implement ", 276 "CopyTensorInSameDevice")); 277 } 278 279 protected: 280 // Does not take ownership. set_tensorflow_device_thread_pool(thread::ThreadPool * thread_pool)281 void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) { 282 device_thread_pool_ = thread_pool; 283 } 284 285 private: 286 Env* const env_; 287 CpuWorkerThreads* cpu_worker_threads_ = nullptr; 288 // Set by GPUs as well as by TPU devices. 289 GpuDeviceInfo* gpu_device_info_ = nullptr; 290 thread::ThreadPool* device_thread_pool_ = nullptr; 291 std::vector<Eigen::ThreadPoolDevice*> eigen_cpu_devices_; 292 #ifdef TENSORFLOW_USE_SYCL 293 Eigen::SyclDevice* eigen_sycl_device_ = nullptr; 294 #endif 295 }; 296 297 // Methods to create and check for Symbolic execution devices. 298 // Such devices are mostly used for TF-XLA bridge. TF should not treat these as 299 // normal devices. 300 void AddSymbolicExecutionDevice(absl::string_view device_name); 301 bool IsSymbolicExecutionDevice(absl::string_view device_name); 302 303 } // namespace tensorflow 304 305 #endif // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_ 306