• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Suite of types that represent device memory allocations. These are
17 // allocated by the StreamExecutor interface, which produces values appropriate
18 // for the underlying platform (whether it be CUDA or OpenCL).
19 //
20 // The untyped base class (like a device void*) is DeviceMemoryBase, which can
21 // be specialized for a given allocation type (like a device T*) using
22 // DeviceMemory<T>.
23 
24 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
25 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
26 
#include <stddef.h>

#include <functional>

#include "tensorflow/stream_executor/lib/casts.h"
#include "tensorflow/stream_executor/platform/port.h"
31 
32 namespace perftools {
33 namespace gputools {
34 
35 class StreamExecutor;
36 
37 // void*-analogous device memory allocation. For the typed variation, see
38 // DeviceMemory<T>.
39 //
40 // This is effectively a two-tuple of a pointer and size; however, note that the
41 // pointer may not be to the virtual address itself -- in OpenCL the pointer is
42 // to a cl_mem handle that describes the device allocation. Therefore,
43 // DeviceMemoryBase::opaque does not necessarily produce a pointer that can be
44 // referenced directly, so use it with caution.
45 //
46 // Thread-compatible.
47 class DeviceMemoryBase {
48  public:
49   // Default constructor instantiates a null-pointed, zero-sized device memory
50   // region. An opaque pointer may be provided -- see header for details on the
51   // opacity of that pointer.
52   explicit DeviceMemoryBase(void *opaque = nullptr, uint64 size = 0,
53                             bool is_sub_buffer = false)
opaque_(opaque)54       : opaque_(opaque), size_(size), is_sub_buffer_(is_sub_buffer) {}
55 
56   // Returns whether the backing memory is the null pointer.
57   // A `== nullptr` convenience method is also provided.
is_null()58   bool is_null() const { return opaque_ == nullptr; }
59   bool operator==(std::nullptr_t other) const { return is_null(); }
60   bool operator!=(std::nullptr_t other) const { return !is_null(); }
61 
62   // Provides a partial order between device memory values.
63   //
64   // This operator is provided so that this object can be used as a key in an
65   // ordered map.
66   bool operator<(const DeviceMemoryBase &other) const {
67     return opaque() < other.opaque();
68   }
69 
70   // Returns the size, in bytes, for the backing memory.
size()71   uint64 size() const { return size_; }
72 
73   // Warning: note that the pointer returned is not necessarily directly to
74   // device virtual address space, but is platform-dependent.
opaque()75   void *opaque() { return opaque_; }
opaque()76   const void *opaque() const { return opaque_; }
77 
78   // Returns true if this is an offset into another primary allocation.
is_sub_buffer()79   bool is_sub_buffer() const { return is_sub_buffer_; }
80 
81   // Returns whether the two DeviceMemoryBase segments are identical (both in
82   // their opaque pointer and size).
IsSameAs(const DeviceMemoryBase & other)83   bool IsSameAs(const DeviceMemoryBase &other) const {
84     return opaque() == other.opaque() && size() == other.size();
85   }
86 
87  protected:
88   friend class StreamExecutor;
89 
90   // Resets the internal values of the opaque pointer and number of bytes in the
91   // memory region, just as in the constructor.
Reset(void * opaque,uint64 bytes)92   void Reset(void *opaque, uint64 bytes) {
93     opaque_ = opaque;
94     size_ = bytes;
95   }
96 
97  private:
98   void *opaque_;  // Platform-dependent value representing allocated memory.
99   uint64 size_;   // Size in bytes of this allocation.
100   bool is_sub_buffer_;  // Is this a primary allocation or a sub-buffer?
101 };
102 
103 // Typed wrapper around "void *"-like DeviceMemoryBase.
104 //
105 // For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase
106 // that represents one or more integers in Device memory.
107 //
108 // Thread-compatible.
109 template <typename ElemT>
110 class DeviceMemory final : public DeviceMemoryBase {
111  public:
112   // Default constructor instantiates a null-pointed, zero-sized memory region.
DeviceMemory()113   DeviceMemory() : DeviceMemoryBase(nullptr, 0) {}
DeviceMemory(std::nullptr_t)114   DeviceMemory(std::nullptr_t) : DeviceMemory() {}
115 
116   // Typed device memory regions may be constructed from untyped device memory
117   // regions, this effectively amounts to a cast from a void*.
DeviceMemory(const DeviceMemoryBase & other)118   explicit DeviceMemory(const DeviceMemoryBase &other)
119       : DeviceMemoryBase(const_cast<DeviceMemoryBase &>(other).opaque(),
120                          other.size(), other.is_sub_buffer()) {}
121 
122   // Returns the number of elements of type ElemT that constitute this
123   // allocation.
ElementCount()124   uint64 ElementCount() const { return size() / sizeof(ElemT); }
125 
126   // Returns whether this is a single-element allocation.
IsScalar()127   bool IsScalar() const { return ElementCount() == 1; }
128 
129   // Create a typed area of DeviceMemory with a given opaque pointer and the
130   // quantity of bytes in the allocation. This function is broken out to
131   // distinguish bytes from an element count.
MakeFromByteSize(void * opaque,uint64 bytes)132   static DeviceMemory<ElemT> MakeFromByteSize(void *opaque, uint64 bytes) {
133     return DeviceMemory<ElemT>(opaque, bytes);
134   }
135 
136   // Resets the DeviceMemory data, in MakeFromByteSize fashion.
137   // This simply clobbers the prior values.
ResetFromByteSize(void * opaque,uint64 bytes)138   void ResetFromByteSize(void *opaque, uint64 bytes) {
139     // TODO(leary) when NVCC is eliminated we can add this check (and the
140     // logging include it requires).
141     // CHECK_EQ(0, bytes % sizeof(ElemT));
142     DeviceMemoryBase::Reset(opaque, bytes);
143   }
144 
145   // ------------------------------------------------------------
146 
147  protected:
148   // This constructor is solely used from derived classes; it is made protected
149   // because it accepts a byte-size instead of an element count, which could
150   // potentially be misused given the ElementCount() nature of this interface.
151   //
152   // In order to specify the desire to use byte size instead of element count
153   // explicitly, use MakeFromByteSize.
DeviceMemory(void * opaque,uint64 size)154   DeviceMemory(void *opaque, uint64 size) : DeviceMemoryBase(opaque, size) {}
155 };
156 
157 // A class to encapsulate the type and size of a dynamic shared memory
158 // buffer. Because the buffer exists solely on the device and is not copyable
159 // to the host, memory objects of this type do not maintain buffer pointers
160 // on the host.
161 template <typename ElemT>
162 class SharedDeviceMemory final : public DeviceMemoryBase {
163  public:
SharedDeviceMemory(uint64 elem_count)164   explicit SharedDeviceMemory(uint64 elem_count)
165       : DeviceMemoryBase(nullptr, elem_count * kElemSize) {}
166 
167   static constexpr size_t kElemSize = sizeof(ElemT);
168 
169   // Returns the number of elements of type ElemT that constitute this
170   // allocation.
ElementCount()171   uint64 ElementCount() const { return size() / kElemSize; }
172 
173   // Returns whether this is a single-element allocation.
IsScalar()174   bool IsScalar() const { return ElementCount() == 1; }
175 };
176 
177 // Similar to the typed DeviceMemory, but is the unique owner of its
178 // memory, if any. ScopedDeviceMemory is thread-compatible. It is also
179 // movable and uncopyable to represent unique ownership.
180 template <typename ElemT>
181 class ScopedDeviceMemory {
182  public:
183   // Default construction initializes the internal state to nullptr.  This
184   // mirrors the std::unique_ptr<> functionality, where default construction
185   // produces a nullptr unique_ptr, which can be assigned later.
186   ScopedDeviceMemory();
187 
188   // Parameters:
189   //  parent: Executor used to deallocate memory when this instance goes
190   //          out of scope.
191   //  value: Already-allocated device memory value for this scoped mechanism to
192   //         deallocate. This memory must have been allocated by parent.
193   ScopedDeviceMemory(StreamExecutor *parent, DeviceMemoryBase value);
194 
195   // Constructor overload that places a literal array into device memory
196   ScopedDeviceMemory(StreamExecutor *parent,
197                      std::initializer_list<ElemT> values);
198 
199   // Moves ownership of the memory from other to the constructed
200   // object.
201   //
202   // Postcondition: other == nullptr.
ScopedDeviceMemory(ScopedDeviceMemory && other)203   ScopedDeviceMemory(ScopedDeviceMemory &&other) noexcept:
204       ScopedDeviceMemory(other.parent_, other.Release()) {}
205 
206   // Releases the memory that was provided in the constructor, through the
207   // "parent" StreamExecutor.
208   ~ScopedDeviceMemory();
209 
210   // Moves ownership of the memory from other to this object.
211   //
212   // Postcondition: other == nullptr.
213   ScopedDeviceMemory& operator=(ScopedDeviceMemory &&other) {
214     Reset(other.Release());
215     parent_ = other.parent_;
216     return *this;
217   }
218 
219   // Returns the memory that backs this scoped allocation converted to
220   // DeviceMemory<T> apparent type. This is useful for cases where the
221   // DeviceMemory must be passed by const-ref, as the ScopedDeviceMemory doesn't
222   // allow copying, for scoped-object-lifetime reasons.
cref()223   const DeviceMemory<ElemT> &cref() const { return wrapped_; }
224 
225   // Returns a pointer to the DeviceMemory<T> apparent type for use in mutable
226   // operations. The value returned should not be used outside the scope of this
227   // ScopedDeviceMemory object's lifetime.
ptr()228   DeviceMemory<ElemT> *ptr() { return &wrapped_; }
ptr()229   const DeviceMemory<ElemT> *ptr() const { return &wrapped_; }
230 
231   // Smart-pointer-like operators for the wrapped DeviceMemory.
232   // This reference must not be used outside the lifetime of this
233   // ScopedDeviceMemory.
234   const DeviceMemory<ElemT> &operator*() const { return cref(); }
235   DeviceMemory<ElemT> *operator->() { return ptr(); }
236   const DeviceMemory<ElemT> *operator->() const { return ptr(); }
237   bool operator==(std::nullptr_t other) const { return wrapped_.is_null(); }
238   bool operator!=(std::nullptr_t other) const { return !wrapped_.is_null(); }
239 
240   // Analogous to std::unique_ptr::reset, frees the existing memory held in
241   // this scoped memory container and replaces it with updated. Ownership
242   // of updated is transferred to this object.
243   void Reset(DeviceMemory<ElemT> updated);
244   void Reset(std::nullptr_t);
245 
246   // Analogous to std::unique_ptr::release, releases ownership of the held
247   // memory and transfers it to the caller.
248   //
249   // Postcondition: *this == nullptr
Release()250   DeviceMemory<ElemT> Release() {
251     auto tmp = wrapped_;
252     wrapped_.ResetFromByteSize(nullptr, 0);
253     return tmp;
254   }
255 
256  private:
257   DeviceMemory<ElemT> wrapped_;  // Value we wrap with scoped-release.
258   StreamExecutor *parent_;       // See constructor.
259 
260   SE_DISALLOW_COPY_AND_ASSIGN(ScopedDeviceMemory);
261 };
262 
// Host-side counterparts of the packed-and-aligned vector datatypes used on
// the device side. These types can appear in device kernel signatures, so
// kernels may be launched with them as arguments.

// Two single-precision floats.
struct Float2 {
  float x;
  float y;
};

// Four single-precision floats, held as two consecutive Float2 members.
struct Float4 {
  Float2 xz;
  Float2 yw;
};

// Two double-precision floats.
struct Double2 {
  double x;
  double y;
};

// Compile-time checks that no padding was inserted into the types above.
static_assert(2 * sizeof(float) == sizeof(Float2), "Float2 must be packed");
static_assert(4 * sizeof(float) == sizeof(Float4), "Float4 must be packed");
static_assert(2 * sizeof(double) == sizeof(Double2), "Double2 must be packed");
282 
283 }  // namespace gputools
284 }  // namespace perftools
285 
286 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
287