/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
#define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_

#include <memory>
#include <string>
#include <vector>

#include "absl/base/macros.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/framework/device_attributes.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/device_name_utils.h"

namespace Eigen {
struct ThreadPoolDevice;
}  // end namespace Eigen

namespace stream_executor {
class Stream;
}  // namespace stream_executor

namespace tensorflow {

class Device;
class DeviceAttributes;
class Env;
class EventMgr;
class OpKernelContext;
class ResourceMgr;
class ScopedAllocatorMgr;
class TensorProto;

namespace thread {
class ThreadPool;
}  // namespace thread

// A wrapper for an Eigen Gpu Device that includes per-op state. The
// class is defined even for non-GPU devices since the
// OpKernelContext::Params structure wants to fill it in.
class PerOpGpuDevice {
 public:
  virtual ~PerOpGpuDevice() {}
  virtual const Eigen::GpuDevice& device() const = 0;
};
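
// Illustrative sketch (not part of this header; the class name is
// hypothetical): a GPU backend would typically subclass PerOpGpuDevice and
// return an Eigen::GpuDevice bound to the op's stream.
//
//   class HypotheticalPerOpGpuDevice : public PerOpGpuDevice {
//    public:
//     explicit HypotheticalPerOpGpuDevice(
//         const Eigen::StreamInterface* stream_interface)
//         : device_(stream_interface) {}
//     const Eigen::GpuDevice& device() const override { return device_; }
//
//    private:
//     Eigen::GpuDevice device_;  // Bound to the current op's stream.
//   };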

// A class that devices can subclass to pass around
// Device-specific context to OpKernels.
class DeviceContext : public core::RefCounted {
 public:
  ~DeviceContext() override {}
  virtual stream_executor::Stream* stream() const { return nullptr; }
  virtual void MaintainLifetimeOnStream(const Tensor* t,
                                        stream_executor::Stream* stream) const {
  }

  // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
  // "device_tensor" which is on a non-CPU device "device". "device_tensor"
  // must be allocated to be of the same size as "cpu_tensor".
  virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                                     Tensor* device_tensor, StatusCallback done,
                                     bool sync_dst_compute = true) const {
    done(errors::Internal("Unrecognized device type in CPU-to-device Copy"));
  }

  // Same as CopyCPUTensorToDevice, but in a synchronous way.
  Status CopyCPUTensorToDeviceSync(const Tensor* cpu_tensor, Device* device,
                                   Tensor* device_tensor) const;

  // Copies a tensor in this device.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Device* device, Tensor* output_tensor,
                                      StatusCallback done) const {
    done(errors::Unimplemented("Copy in same device not implemented."));
  }

  // "device_tensor" is a tensor on a non-CPU device.  Copies
  // device_tensor into "cpu_tensor".  "cpu_tensor" must be allocated
  // to be of the same size as "device_tensor".
  virtual void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                                     StringPiece tensor_name, Device* device,
                                     Tensor* cpu_tensor, StatusCallback done) {
    done(errors::Internal("Unrecognized device type in device-to-CPU Copy"));
  }

  // Same as `CopyDeviceTensorToCPU`, but blocks until the copy is done.
  Status CopyDeviceTensorToCPUSync(const Tensor* device_tensor,
                                   StringPiece tensor_name, Device* device,
                                   Tensor* cpu_tensor);

  // If possible, wait for all events on *stream to complete then execute func.
  // A non-OK Status is returned otherwise.  The stream argument should be the
  // one provided by AcceleratorDeviceInfo.  This function is not applicable to
  // devices that don't provide such a value.
  virtual Status ThenExecute(Device* device, stream_executor::Stream* stream,
                             std::function<void()> func) {
    return errors::Internal("ThenExecute not supported by device");
  }

  // Checks if the device is a pluggable device.
  virtual bool IsPluggableDevice() { return false; }

  // Returns the pinned host memory allocator for the device.
  virtual Allocator* host_memory_allocator() const { return nullptr; }
};
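
// Illustrative sketch (not part of this header; names are hypothetical): a
// backend that owns a stream_executor::Stream would typically subclass
// DeviceContext and implement the asynchronous copy hooks, invoking `done`
// with the transfer's status once the enqueued copy completes.
//
//   class HypotheticalDeviceContext : public DeviceContext {
//    public:
//     explicit HypotheticalDeviceContext(stream_executor::Stream* stream)
//         : stream_(stream) {}
//
//     stream_executor::Stream* stream() const override { return stream_; }
//
//     void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
//                                Tensor* device_tensor, StatusCallback done,
//                                bool sync_dst_compute) const override {
//       // Enqueue a host-to-device transfer on stream_ here, then arrange
//       // for `done` to be called when the transfer finishes.
//       done(OkStatus());
//     }
//
//    private:
//     stream_executor::Stream* stream_;  // Not owned.
//   };
//
// Callers that cannot use the asynchronous interface can fall back to the
// blocking wrappers, e.g.
//
//   TF_RETURN_IF_ERROR(
//       ctx->CopyCPUTensorToDeviceSync(&cpu_tensor, device, &device_tensor));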

class DeviceBase {
 public:
  explicit DeviceBase(Env* env) : env_(env) {}
  virtual ~DeviceBase();

  Env* env() const { return env_; }

  struct CpuWorkerThreads {
    int num_threads = 0;
    thread::ThreadPool* workers = nullptr;
  };

  // Does not take ownership.
  void set_tensorflow_cpu_worker_threads(CpuWorkerThreads* t) {
    cpu_worker_threads_ = t;
  }

  virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
    CHECK(cpu_worker_threads_ != nullptr);
    return cpu_worker_threads_;
  }

  // "stream" is used in special circumstances (such as the
  // constructors of Ops) where there is no available OpKernelContext.
  // "default_context" is used by OpKernelContext whenever a device does not
  // supply a DeviceContext for an op in TryGetDeviceContext() (e.g. when only
  // using a single stream.)
  // "event_mgr" is used to delay deallocation of temporary GPU buffers.
  // TODO(pbar) Work out how to move this out of DeviceBase.
  struct AcceleratorDeviceInfo {
    // Make sure all the defaults are NULL, so we can spot missing assignments.
    stream_executor::Stream* stream = nullptr;
    DeviceContext* default_context = nullptr;
    EventMgr* event_mgr = nullptr;
    int gpu_id = -1;
  };
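
  // Illustrative sketch (not part of this header; variable names are
  // hypothetical): during accelerator device construction, a backend would
  // typically populate this struct and register it via
  // set_tensorflow_accelerator_device_info().
  //
  //   auto* info = new AcceleratorDeviceInfo;
  //   info->stream = compute_stream;           // stream_executor::Stream*
  //   info->default_context = device_context;  // DeviceContext*, ref-counted
  //   info->event_mgr = event_mgr;             // delays buffer deallocation
  //   info->gpu_id = ordinal;
  //   set_tensorflow_accelerator_device_info(info);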

  // Does not take ownership.
  void set_tensorflow_accelerator_device_info(
      AcceleratorDeviceInfo* device_info) {
    accelerator_device_info_ = device_info;
  }

  virtual const AcceleratorDeviceInfo* tensorflow_accelerator_device_info()
      const {
    return accelerator_device_info_;
  }

  // The preferred thread pool for this device. If it is nullptr, the system
  // automatically assigns a thread pool for execution.
  virtual thread::ThreadPool* tensorflow_device_thread_pool() {
    return device_thread_pool_;
  }

  // Does not take ownership.
  void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d);

  // Return the Allocator implementation to use based on the allocator
  // attributes requested.  See allocator.h for more details.
  virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
    LOG(FATAL) << "GetAllocator() is not implemented.";
    return nullptr;
  }

  // This method is provided for backwards compatibility, and will be removed
  // in a future release.
  ABSL_DEPRECATED("Use `this->GetAllocator()` or `this->GetScopedAllocator()`.")
  Allocator* GetStepAllocator(AllocatorAttributes attr, ResourceMgr*) {
    return GetAllocator(attr);
  }

  // Return an Allocator prepared for use in particular places by graph
  // optimization.
  virtual Allocator* GetScopedAllocator(AllocatorAttributes attr,
                                        int64_t step_id) {
    LOG(FATAL) << "Device does not implement GetScopedAllocator()";
    return nullptr;
  }

  virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }

  virtual bool has_eigen_cpu_device() const {
    return !eigen_cpu_devices_.empty();
  }

  virtual const Eigen::ThreadPoolDevice* eigen_cpu_device();

  // Caller owns the return value. The OpKernelContext calls this even
  // for devices that do not implement an eigen_gpu_device. Overridden
  // by GPU devices to return a derived type.
  virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; }

  virtual DeviceBase* UnderlyingDevice() { return this; }
  virtual const DeviceBase* UnderlyingDevice() const { return this; }

  // This is overridden by GPU devices to reinitialize the derived
  // type returned by MakeGpuDevice.
  virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
                                       PerOpGpuDevice* /*device*/,
                                       DeviceContext* /*dc*/,
                                       Allocator* /*allocator*/) {
    return OkStatus();
  }

  // Unimplemented by default.
  virtual const DeviceAttributes& attributes() const;
  virtual int NumaNode() const { return attributes().locality().numa_node(); }
  virtual const std::string& name() const;
  virtual const DeviceNameUtils::ParsedName& parsed_name() const;

  // Updates `attributes()`, indicating the XLA global ID associated with this
  // device. This ID is unique across clients in a multi-client setup. For TPUs
  // this does not happen until the TPU system has been initialized.
  //
  // Implemented in Device.
  virtual void set_xla_global_id(int64_t id) {}

  // Materializes the given TensorProto into 'tensor' stored in Device
  // memory.  Most devices will want to override this.
  //
  // TODO(vrv): We should be able to put this function into
  // OpKernelContext and handle the copies from device memory via send
  // and receive nodes, instead of requiring that each device handle
  // the copies here as well as in copy ops.
  virtual Status MakeTensorFromProto(const TensorProto& tensor_proto,
                                     const AllocatorAttributes alloc_attrs,
                                     Tensor* tensor) {
    return errors::Internal("Device does not implement MakeTensorFromProto()");
  }
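
  // Illustrative sketch (not part of this header): a host-memory device could
  // implement MakeTensorFromProto roughly as below, assuming the proto can be
  // parsed directly into memory obtained from GetAllocator().
  //
  //   Status MakeTensorFromProto(const TensorProto& tensor_proto,
  //                              const AllocatorAttributes alloc_attrs,
  //                              Tensor* tensor) override {
  //     Tensor parsed(tensor_proto.dtype());
  //     if (!parsed.FromProto(GetAllocator(alloc_attrs), tensor_proto)) {
  //       return errors::InvalidArgument("Cannot parse tensor from proto");
  //     }
  //     *tensor = std::move(parsed);
  //     return OkStatus();
  //   }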

  // Some devices (e.g. GPUs) may free device memory prior to its actual use
  // being completed on the assumption that subsequent allocations can only be
  // used serially with respect to pending uses.  If this function returns a
  // non-zero value it is the value of a device-specific counter such that any
  // device memory tagged with an earlier freed-at count is really unencumbered
  // by pending uses.  For this to be useful the device memory allocator must
  // be tagging deallocated memory chunks using the same counter.
  virtual uint64 SafeAllocFrontier(uint64 old_value) { return 0; }

  // Copies `input_tensor` to `output_tensor`, where both tensors are on this
  // device. This function assumes that `output_tensor` has already been
  // allocated with a buffer that is large enough to hold `input_tensor`'s data.
  // Calls `done` from a device-specific thread after the copy is finished,
  // which may be the same as the calling thread.
  //
  // NOTE(ayushd): This function is for TensorFlow internal use only.  Deep copy
  // is discouraged and should not be used in OpKernels.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Tensor* output_tensor,
                                      const DeviceContext* device_context,
                                      StatusCallback done) {
    done(errors::Internal("Device ", name(), " does not implement ",
                          "CopyTensorInSameDevice"));
  }

 protected:
  // Does not take ownership.
  void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
    device_thread_pool_ = thread_pool;
  }

 private:
  Env* const env_;
  CpuWorkerThreads* cpu_worker_threads_ = nullptr;
  // Set by GPUs as well as by TPU devices.
  AcceleratorDeviceInfo* accelerator_device_info_ = nullptr;
  thread::ThreadPool* device_thread_pool_ = nullptr;
  std::vector<Eigen::ThreadPoolDevice*> eigen_cpu_devices_;
};
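
// Illustrative usage sketch (not part of this header; `device`, `in_tensor`,
// and `out_tensor` are hypothetical): code holding a DeviceBase* can dispatch
// Eigen expressions onto the device's CPU thread pool when one has been
// configured via set_eigen_cpu_device().
//
//   if (device->has_eigen_cpu_device()) {
//     const Eigen::ThreadPoolDevice* d = device->eigen_cpu_device();
//     out_tensor.flat<float>().device(*d) =
//         in_tensor.flat<float>() * 2.0f;  // Runs on the device thread pool.
//   }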

// Methods to create and check for Symbolic execution devices.
// Such devices are mostly used for the TF-XLA bridge. TF should not treat
// these as normal devices.
void AddSymbolicExecutionDevice(absl::string_view device_name);
bool IsSymbolicExecutionDevice(absl::string_view device_name);

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_