1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // Suite of types that represent device memory allocations. These are 17 // allocated by the StreamExecutor interface, which produces values appropriate 18 // for the underlying platform (whether it be CUDA or OpenCL). 19 // 20 // The untyped base class (like a device void*) is DeviceMemoryBase, which can 21 // be specialized for a given allocation type (like a device T*) using 22 // DeviceMemory<T>. 23 24 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_ 25 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_ 26 27 #include <stddef.h> 28 29 #include "tensorflow/stream_executor/platform/port.h" 30 31 namespace perftools { 32 namespace gputools { 33 34 // Temporarily pull stream_executor into perftools::gputools while we migrate 35 // code to the new namespace. TODO(b/77980417): Remove this once we've 36 // completed the migration. 37 using namespace stream_executor; // NOLINT[build/namespaces] 38 39 } // namespace gputools 40 } // namespace perftools 41 42 namespace stream_executor { 43 44 class StreamExecutor; 45 46 // void*-analogous device memory allocation. For the typed variation, see 47 // DeviceMemory<T>. 48 // 49 // This is effectively a two-tuple of a pointer and size; however, note that the 50 // pointer may not be to the virtual address itself -- in OpenCL the pointer is 51 // to a cl_mem handle that describes the device allocation. Therefore, 52 // DeviceMemoryBase::opaque does not necessarily produce a pointer that can be 53 // referenced directly, so use it with caution. 54 // 55 // Thread-compatible. 56 class DeviceMemoryBase { 57 public: 58 // Default constructor instantiates a null-pointed, zero-sized device memory 59 // region. An opaque pointer may be provided -- see header for details on the 60 // opacity of that pointer. 61 explicit DeviceMemoryBase(void *opaque = nullptr, uint64 size = 0, 62 bool is_sub_buffer = false) opaque_(opaque)63 : opaque_(opaque), size_(size), is_sub_buffer_(is_sub_buffer) {} 64 65 // Returns whether the backing memory is the null pointer. 66 // A `== nullptr` convenience method is also provided. is_null()67 bool is_null() const { return opaque_ == nullptr; } 68 bool operator==(std::nullptr_t other) const { return is_null(); } 69 bool operator!=(std::nullptr_t other) const { return !is_null(); } 70 71 // Provides a partial order between device memory values. 72 // 73 // This operator is provided so that this object can be used as a key in an 74 // ordered map. 75 bool operator<(const DeviceMemoryBase &other) const { 76 return opaque() < other.opaque(); 77 } 78 79 // Returns the size, in bytes, for the backing memory. size()80 uint64 size() const { return size_; } 81 82 // Warning: note that the pointer returned is not necessarily directly to 83 // device virtual address space, but is platform-dependent. opaque()84 void *opaque() { return opaque_; } opaque()85 const void *opaque() const { return opaque_; } 86 87 // Returns true if this is an offset into another primary allocation. is_sub_buffer()88 bool is_sub_buffer() const { return is_sub_buffer_; } 89 90 // Returns whether the two DeviceMemoryBase segments are identical (both in 91 // their opaque pointer and size). IsSameAs(const DeviceMemoryBase & other)92 bool IsSameAs(const DeviceMemoryBase &other) const { 93 return opaque() == other.opaque() && size() == other.size(); 94 } 95 96 protected: 97 friend class StreamExecutor; 98 99 // Resets the internal values of the opaque pointer and number of bytes in the 100 // memory region, just as in the constructor. Reset(void * opaque,uint64 bytes)101 void Reset(void *opaque, uint64 bytes) { 102 opaque_ = opaque; 103 size_ = bytes; 104 } 105 106 private: 107 void *opaque_; // Platform-dependent value representing allocated memory. 108 uint64 size_; // Size in bytes of this allocation. 109 bool is_sub_buffer_; // Is this a primary allocation or a sub-buffer? 110 }; 111 112 // Typed wrapper around "void *"-like DeviceMemoryBase. 113 // 114 // For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase 115 // that represents one or more integers in Device memory. 116 // 117 // Thread-compatible. 118 template <typename ElemT> 119 class DeviceMemory final : public DeviceMemoryBase { 120 public: 121 // Default constructor instantiates a null-pointed, zero-sized memory region. DeviceMemory()122 DeviceMemory() : DeviceMemoryBase(nullptr, 0) {} DeviceMemory(std::nullptr_t)123 DeviceMemory(std::nullptr_t) : DeviceMemory() {} 124 125 // Typed device memory regions may be constructed from untyped device memory 126 // regions, this effectively amounts to a cast from a void*. DeviceMemory(const DeviceMemoryBase & other)127 explicit DeviceMemory(const DeviceMemoryBase &other) 128 : DeviceMemoryBase(const_cast<DeviceMemoryBase &>(other).opaque(), 129 other.size(), other.is_sub_buffer()) {} 130 131 // Returns the number of elements of type ElemT that constitute this 132 // allocation. ElementCount()133 uint64 ElementCount() const { return size() / sizeof(ElemT); } 134 135 // Returns whether this is a single-element allocation. IsScalar()136 bool IsScalar() const { return ElementCount() == 1; } 137 138 // Create a typed area of DeviceMemory with a given opaque pointer and the 139 // quantity of bytes in the allocation. This function is broken out to 140 // distinguish bytes from an element count. MakeFromByteSize(void * opaque,uint64 bytes)141 static DeviceMemory<ElemT> MakeFromByteSize(void *opaque, uint64 bytes) { 142 return DeviceMemory<ElemT>(opaque, bytes); 143 } 144 145 // Resets the DeviceMemory data, in MakeFromByteSize fashion. 146 // This simply clobbers the prior values. ResetFromByteSize(void * opaque,uint64 bytes)147 void ResetFromByteSize(void *opaque, uint64 bytes) { 148 // TODO(leary) when NVCC is eliminated we can add this check (and the 149 // logging include it requires). 150 // CHECK_EQ(0, bytes % sizeof(ElemT)); 151 DeviceMemoryBase::Reset(opaque, bytes); 152 } 153 154 // ------------------------------------------------------------ 155 156 protected: 157 // This constructor is solely used from derived classes; it is made protected 158 // because it accepts a byte-size instead of an element count, which could 159 // potentially be misused given the ElementCount() nature of this interface. 160 // 161 // In order to specify the desire to use byte size instead of element count 162 // explicitly, use MakeFromByteSize. DeviceMemory(void * opaque,uint64 size)163 DeviceMemory(void *opaque, uint64 size) : DeviceMemoryBase(opaque, size) {} 164 }; 165 166 // A class to encapsulate the type and size of a dynamic shared memory 167 // buffer. Because the buffer exists solely on the device and is not copyable 168 // to the host, memory objects of this type do not maintain buffer pointers 169 // on the host. 170 template <typename ElemT> 171 class SharedDeviceMemory final : public DeviceMemoryBase { 172 public: SharedDeviceMemory(uint64 elem_count)173 explicit SharedDeviceMemory(uint64 elem_count) 174 : DeviceMemoryBase(nullptr, elem_count * kElemSize) {} 175 176 static constexpr size_t kElemSize = sizeof(ElemT); 177 178 // Returns the number of elements of type ElemT that constitute this 179 // allocation. ElementCount()180 uint64 ElementCount() const { return size() / kElemSize; } 181 182 // Returns whether this is a single-element allocation. IsScalar()183 bool IsScalar() const { return ElementCount() == 1; } 184 }; 185 186 // Similar to the typed DeviceMemory, but is the unique owner of its 187 // memory, if any. ScopedDeviceMemory is thread-compatible. It is also 188 // movable and uncopyable to represent unique ownership. 189 template <typename ElemT> 190 class ScopedDeviceMemory { 191 public: 192 // Default construction initializes the internal state to nullptr. This 193 // mirrors the std::unique_ptr<> functionality, where default construction 194 // produces a nullptr unique_ptr, which can be assigned later. 195 ScopedDeviceMemory(); 196 197 // Parameters: 198 // parent: Executor used to deallocate memory when this instance goes 199 // out of scope. 200 // value: Already-allocated device memory value for this scoped mechanism to 201 // deallocate. This memory must have been allocated by parent. 202 ScopedDeviceMemory(StreamExecutor *parent, DeviceMemoryBase value); 203 204 // Constructor overload that places a literal array into device memory 205 ScopedDeviceMemory(StreamExecutor *parent, 206 std::initializer_list<ElemT> values); 207 208 // Moves ownership of the memory from other to the constructed 209 // object. 210 // 211 // Postcondition: other == nullptr. ScopedDeviceMemory(ScopedDeviceMemory && other)212 ScopedDeviceMemory(ScopedDeviceMemory &&other) noexcept: 213 ScopedDeviceMemory(other.parent_, other.Release()) {} 214 215 // Releases the memory that was provided in the constructor, through the 216 // "parent" StreamExecutor. 217 ~ScopedDeviceMemory(); 218 219 // Moves ownership of the memory from other to this object. 220 // 221 // Postcondition: other == nullptr. 222 ScopedDeviceMemory& operator=(ScopedDeviceMemory &&other) { 223 Reset(other.Release()); 224 parent_ = other.parent_; 225 return *this; 226 } 227 228 // Returns the memory that backs this scoped allocation converted to 229 // DeviceMemory<T> apparent type. This is useful for cases where the 230 // DeviceMemory must be passed by const-ref, as the ScopedDeviceMemory doesn't 231 // allow copying, for scoped-object-lifetime reasons. cref()232 const DeviceMemory<ElemT> &cref() const { return wrapped_; } 233 234 // Returns a pointer to the DeviceMemory<T> apparent type for use in mutable 235 // operations. The value returned should not be used outside the scope of this 236 // ScopedDeviceMemory object's lifetime. ptr()237 DeviceMemory<ElemT> *ptr() { return &wrapped_; } ptr()238 const DeviceMemory<ElemT> *ptr() const { return &wrapped_; } 239 240 // Smart-pointer-like operators for the wrapped DeviceMemory. 241 // This reference must not be used outside the lifetime of this 242 // ScopedDeviceMemory. 243 const DeviceMemory<ElemT> &operator*() const { return cref(); } 244 DeviceMemory<ElemT> *operator->() { return ptr(); } 245 const DeviceMemory<ElemT> *operator->() const { return ptr(); } 246 bool operator==(std::nullptr_t other) const { return wrapped_.is_null(); } 247 bool operator!=(std::nullptr_t other) const { return !wrapped_.is_null(); } 248 249 // Analogous to std::unique_ptr::reset, frees the existing memory held in 250 // this scoped memory container and replaces it with updated. Ownership 251 // of updated is transferred to this object. 252 void Reset(DeviceMemory<ElemT> updated); 253 void Reset(std::nullptr_t); 254 255 // Analogous to std::unique_ptr::release, releases ownership of the held 256 // memory and transfers it to the caller. 257 // 258 // Postcondition: *this == nullptr Release()259 DeviceMemory<ElemT> Release() { 260 auto tmp = wrapped_; 261 wrapped_.ResetFromByteSize(nullptr, 0); 262 return tmp; 263 } 264 265 private: 266 DeviceMemory<ElemT> wrapped_; // Value we wrap with scoped-release. 267 StreamExecutor *parent_; // See constructor. 268 269 SE_DISALLOW_COPY_AND_ASSIGN(ScopedDeviceMemory); 270 }; 271 272 // Host-side representation of packed-and-aligned vector datatypes on the device 273 // side. Since these can appear in device kernel signatures, we support 274 // launching them with these datatypes in launch signatures. 275 276 struct Float2 { 277 float x, y; 278 }; 279 280 struct Float4 { 281 Float2 xz, yw; 282 }; 283 284 struct Double2 { 285 double x, y; 286 }; 287 288 static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed"); 289 static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed"); 290 static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed"); 291 292 } // namespace stream_executor 293 294 #endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_ 295