/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Suite of types that represent device memory allocations. These are
// allocated by the StreamExecutor interface, which produces values appropriate
// for the underlying platform (whether it be CUDA or OpenCL).
//
// The untyped base class (like a device void*) is DeviceMemoryBase, which can
// be specialized for a given allocation type (like a device T*) using
// DeviceMemory<T>.

#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_

#include <stddef.h>

#include <cstddef>
#include <functional>

#include "tensorflow/stream_executor/platform/port.h"

namespace perftools {
namespace gputools {

// Temporarily pull stream_executor into perftools::gputools while we migrate
// code to the new namespace.  TODO(b/77980417): Remove this once we've
// completed the migration.
using namespace stream_executor;  // NOLINT[build/namespaces]

}  // namespace gputools
}  // namespace perftools

namespace stream_executor {

class DeviceMemoryAllocator;
class StreamExecutor;

// void*-analogous device memory allocation. For the typed variation, see
// DeviceMemory<T>.
49 // 50 // This is effectively a two-tuple of a pointer and size; however, note that the 51 // pointer may not be to the virtual address itself -- in OpenCL the pointer is 52 // to a cl_mem handle that describes the device allocation. Therefore, 53 // DeviceMemoryBase::opaque does not necessarily produce a pointer that can be 54 // referenced directly, so use it with caution. 55 // 56 // Thread-compatible. 57 class DeviceMemoryBase { 58 public: 59 // Default constructor instantiates a null-pointed, zero-sized device memory 60 // region. An opaque pointer may be provided -- see header for details on the 61 // opacity of that pointer. 62 explicit DeviceMemoryBase(void *opaque = nullptr, uint64 size = 0) opaque_(opaque)63 : opaque_(opaque), size_(size) {} 64 65 // Returns whether the backing memory is the null pointer. 66 // A `== nullptr` convenience method is also provided. is_null()67 bool is_null() const { return opaque_ == nullptr; } 68 bool operator==(std::nullptr_t other) const { return is_null(); } 69 bool operator!=(std::nullptr_t other) const { return !is_null(); } 70 71 // Provides a partial order between device memory values. 72 // 73 // This operator is provided so that this object can be used as a key in an 74 // ordered map. 75 bool operator<(const DeviceMemoryBase &other) const { 76 return opaque() < other.opaque(); 77 } 78 79 // Returns the size, in bytes, for the backing memory. size()80 uint64 size() const { return size_; } 81 82 // Warning: note that the pointer returned is not necessarily directly to 83 // device virtual address space, but is platform-dependent. opaque()84 void *opaque() { return opaque_; } opaque()85 const void *opaque() const { return opaque_; } 86 87 // Returns the payload of this memory region. payload()88 uint64 payload() const { return payload_; } 89 90 // Sets payload to given value. 
SetPayload(uint64 payload)91 void SetPayload(uint64 payload) { payload_ = payload; } 92 93 // Returns whether the two DeviceMemoryBase segments are identical (both in 94 // their opaque pointer and size). IsSameAs(const DeviceMemoryBase & other)95 bool IsSameAs(const DeviceMemoryBase &other) const { 96 return opaque() == other.opaque() && size() == other.size(); 97 } 98 99 protected: 100 friend class StreamExecutor; 101 102 // Resets the internal values of the opaque pointer and number of bytes in the 103 // memory region, just as in the constructor. Reset(void * opaque,uint64 bytes)104 void Reset(void *opaque, uint64 bytes) { 105 opaque_ = opaque; 106 size_ = bytes; 107 } 108 109 private: 110 void *opaque_; // Platform-dependent value representing allocated memory. 111 uint64 size_; // Size in bytes of this allocation. 112 uint64 payload_ = 0; // Payload data associated with this allocation. 113 }; 114 115 // Typed wrapper around "void *"-like DeviceMemoryBase. 116 // 117 // For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase 118 // that represents one or more integers in Device memory. 119 // 120 // Thread-compatible. 121 template <typename ElemT> 122 class DeviceMemory final : public DeviceMemoryBase { 123 public: 124 // Default constructor instantiates a null-pointed, zero-sized memory region. DeviceMemory()125 DeviceMemory() : DeviceMemoryBase(nullptr, 0) {} DeviceMemory(std::nullptr_t)126 explicit DeviceMemory(std::nullptr_t) : DeviceMemory() {} 127 128 // Typed device memory regions may be constructed from untyped device memory 129 // regions, this effectively amounts to a cast from a void*. DeviceMemory(const DeviceMemoryBase & other)130 explicit DeviceMemory(const DeviceMemoryBase &other) 131 : DeviceMemoryBase(const_cast<DeviceMemoryBase &>(other).opaque(), 132 other.size()) { 133 SetPayload(other.payload()); 134 } 135 136 // Returns the number of elements of type ElemT that constitute this 137 // allocation. 
ElementCount()138 uint64 ElementCount() const { return size() / sizeof(ElemT); } 139 140 // Returns whether this is a single-element allocation. IsScalar()141 bool IsScalar() const { return ElementCount() == 1; } 142 143 // Create a typed area of DeviceMemory with a given opaque pointer and the 144 // quantity of bytes in the allocation. This function is broken out to 145 // distinguish bytes from an element count. MakeFromByteSize(void * opaque,uint64 bytes)146 static DeviceMemory<ElemT> MakeFromByteSize(void *opaque, uint64 bytes) { 147 return DeviceMemory<ElemT>(opaque, bytes); 148 } 149 150 // Resets the DeviceMemory data, in MakeFromByteSize fashion. 151 // This simply clobbers the prior values. ResetFromByteSize(void * opaque,uint64 bytes)152 void ResetFromByteSize(void *opaque, uint64 bytes) { 153 // TODO(leary) when NVCC is eliminated we can add this check (and the 154 // logging include it requires). 155 // CHECK_EQ(0, bytes % sizeof(ElemT)); 156 DeviceMemoryBase::Reset(opaque, bytes); 157 } 158 159 // ------------------------------------------------------------ 160 161 protected: 162 // This constructor is solely used from derived classes; it is made protected 163 // because it accepts a byte-size instead of an element count, which could 164 // potentially be misused given the ElementCount() nature of this interface. 165 // 166 // In order to specify the desire to use byte size instead of element count 167 // explicitly, use MakeFromByteSize. DeviceMemory(void * opaque,uint64 size)168 DeviceMemory(void *opaque, uint64 size) : DeviceMemoryBase(opaque, size) {} 169 }; 170 171 // A class to encapsulate the type and size of a dynamic shared memory 172 // buffer. Because the buffer exists solely on the device and is not copyable 173 // to the host, memory objects of this type do not maintain buffer pointers 174 // on the host. 
175 template <typename ElemT> 176 class SharedDeviceMemory final : public DeviceMemoryBase { 177 public: SharedDeviceMemory(uint64 elem_count)178 explicit SharedDeviceMemory(uint64 elem_count) 179 : DeviceMemoryBase(nullptr, elem_count * kElemSize) {} 180 181 static constexpr size_t kElemSize = sizeof(ElemT); 182 183 // Returns the number of elements of type ElemT that constitute this 184 // allocation. ElementCount()185 uint64 ElementCount() const { return size() / kElemSize; } 186 187 // Returns whether this is a single-element allocation. IsScalar()188 bool IsScalar() const { return ElementCount() == 1; } 189 }; 190 191 // Host-side representation of packed-and-aligned vector datatypes on the device 192 // side. Since these can appear in device kernel signatures, we support 193 // launching them with these datatypes in launch signatures. 194 195 struct Float2 { 196 float x, y; 197 }; 198 199 struct Float4 { 200 Float2 xz, yw; 201 }; 202 203 struct Double2 { 204 double x, y; 205 }; 206 207 static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed"); 208 static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed"); 209 static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed"); 210 211 } // namespace stream_executor 212 213 #endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_ 214