// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H

// This header contains the definitions of the gpu* macros, which resolve to
// their hip* or cuda* equivalents depending on the compiler in use.
// A separate header (included at the end of this file) will undefine them all.
#include "TensorGpuHipCudaDefines.h"

namespace Eigen {

static const int kGpuScratchSize = 1024;

// This defines an interface that GPUDevice can take to use
// HIP / CUDA streams underneath.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const gpuStream_t& stream() const = 0;
  virtual const gpuDeviceProp_t& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kGpuScratchSize (1 KiB) bytes
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initialized to 0, and each kernel
  // using it is responsible for resetting it to 0 upon completion, in order
  // to maintain the invariant that the semaphore is always equal to 0 when
  // each kernel starts.
  virtual unsigned int* semaphore() const = 0;
};
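
// Illustrative sketch (not part of the original header): callers normally use
// the GpuStreamDevice implementation defined further below, but any custom
// implementation of StreamInterface can be supplied, e.g. to route device
// allocations through an application-owned memory pool. The MyPool type and
// the pool_alloc/pool_free functions here are hypothetical placeholders.
//
//   class PooledStreamDevice : public Eigen::StreamInterface {
//    public:
//     PooledStreamDevice(const gpuStream_t* stream, int device, MyPool* pool)
//         : impl_(stream, device), pool_(pool) {}
//     const gpuStream_t& stream() const override { return impl_.stream(); }
//     const gpuDeviceProp_t& deviceProperties() const override {
//       return impl_.deviceProperties();
//     }
//     // Only the allocation calls are customized; everything else delegates
//     // to the stock GpuStreamDevice.
//     void* allocate(size_t num_bytes) const override {
//       return pool_alloc(pool_, num_bytes);
//     }
//     void deallocate(void* buffer) const override { pool_free(pool_, buffer); }
//     void* scratchpad() const override { return impl_.scratchpad(); }
//     unsigned int* semaphore() const override { return impl_.semaphore(); }
//    private:
//     Eigen::GpuStreamDevice impl_;
//     MyPool* pool_;
//   };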

class GpuDeviceProperties {
 public:
  GpuDeviceProperties() :
      initialized_(false), first_(true), device_properties_(nullptr) {}

  ~GpuDeviceProperties() {
    if (device_properties_) {
      delete[] device_properties_;
    }
  }

  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
    return device_properties_[device];
  }

  EIGEN_STRONG_INLINE bool isInitialized() const {
    return initialized_;
  }

  void initialize() {
    if (!initialized_) {
      // Attempts to ensure proper behavior in the case of multiple threads
      // calling this function simultaneously. This would be trivial to
      // implement if we could use std::mutex, but unfortunately std::mutex
      // doesn't compile with nvcc, so we resort to atomics and thread fences
      // instead. Note that if the caller uses a compiler that doesn't support
      // C++11 we can't ensure that the initialization is thread safe.
      if (first_.exchange(false)) {
        // We're the first thread to reach this point.
        int num_devices;
        gpuError_t status = gpuGetDeviceCount(&num_devices);
        if (status != gpuSuccess) {
          std::cerr << "Failed to get the number of GPU devices: "
                    << gpuGetErrorString(status)
                    << std::endl;
          gpu_assert(status == gpuSuccess);
        }
        device_properties_ = new gpuDeviceProp_t[num_devices];
        for (int i = 0; i < num_devices; ++i) {
          status = gpuGetDeviceProperties(&device_properties_[i], i);
          if (status != gpuSuccess) {
            std::cerr << "Failed to initialize GPU device #"
                      << i
                      << ": "
                      << gpuGetErrorString(status)
                      << std::endl;
            gpu_assert(status == gpuSuccess);
          }
        }

        std::atomic_thread_fence(std::memory_order_release);
        initialized_ = true;
      } else {
        // Wait for the other thread to initialize the properties.
        while (!initialized_) {
          std::atomic_thread_fence(std::memory_order_acquire);
          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        }
      }
    }
  }

 private:
  volatile bool initialized_;
  std::atomic<bool> first_;
  gpuDeviceProp_t* device_properties_;
};

EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
  if (!deviceProperties->isInitialized()) {
    deviceProperties->initialize();
  }
  return *deviceProperties;
}

EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
  return GetGpuDeviceProperties().get(device);
}
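
// Illustrative sketch (not part of the original header): the properties
// returned here are the raw gpuDeviceProp_t structures (cudaDeviceProp or
// hipDeviceProp_t), so callers can size their launches from them. The device
// index 0 and the block size of 256 below are arbitrary example values.
//
//   const gpuDeviceProp_t& prop = Eigen::GetGpuDeviceProperties(0);
//   const int block_size = prop.maxThreadsPerBlock < 256
//                              ? prop.maxThreadsPerBlock : 256;
//   const int num_blocks = prop.multiProcessorCount *
//                          (prop.maxThreadsPerMultiProcessor / block_size);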

static const gpuStream_t default_stream = gpuStreamDefault;

class GpuStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device
  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    gpuGetDevice(&device_);
  }
  // Use the default stream on the specified device
  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
  // Use the specified stream. Note that it's the caller's responsibility to
  // ensure that the stream can run on the specified device. If no device is
  // specified, the code assumes that the stream is associated with the
  // current GPU device.
  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      gpuGetDevice(&device_);
    } else {
      int num_devices;
      gpuError_t err = gpuGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
      gpu_assert(device < num_devices);
      device_ = device;
    }
  }

  virtual ~GpuStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const gpuStream_t& stream() const { return *stream_; }
  const gpuDeviceProp_t& deviceProperties() const {
    return GetGpuDeviceProperties(device_);
  }
  virtual void* allocate(size_t num_bytes) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    void* result;
    err = gpuMalloc(&result, num_bytes);
    gpu_assert(err == gpuSuccess);
    gpu_assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    gpu_assert(buffer != NULL);
    err = gpuFree(buffer);
    gpu_assert(err == gpuSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
    }
    return semaphore_;
  }

 private:
  const gpuStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
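
// Illustrative sketch (not part of the original header): a GpuStreamDevice can
// wrap either the default stream or a stream created by the caller. With the
// CUDA backend this could look as follows; the stream creation and destruction
// calls are the standard CUDA runtime API, not something defined in this file,
// and the stream must stay alive for as long as the GpuStreamDevice uses it.
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   Eigen::GpuStreamDevice stream_device(&stream, /*device=*/0);
//   ...  // use stream_device (see GpuDevice below)
//   cudaStreamDestroy(stream);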

struct GpuDevice {
  // The StreamInterface is not owned: the caller is
  // responsible for its initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  // TODO(bsteiner): This is an internal API, we should not expose it.
  EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    stream_->deallocate(buffer);
  }

  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
    stream_->deallocate(buffer);
  }

  template<typename Type>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
  }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
    return stream_->semaphore();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
                                    stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(dst);
    EIGEN_UNUSED_VARIABLE(src);
    EIGEN_UNUSED_VARIABLE(n);
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    gpuError_t err =
        gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    gpuError_t err =
        gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on HIP/CUDA devices.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuStreamSynchronize(stream_->stream());
    if (err != gpuSuccess) {
      std::cerr << "Error detected in GPU stream: "
                << gpuGetErrorString(err)
                << std::endl;
      gpu_assert(err == gpuSuccess);
    }
#else
    gpu_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return stream_->deviceProperties().sharedMemPerBlock;
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
  }

  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  // This function checks if the GPU runtime recorded an error for the
  // underlying stream device.
  inline bool ok() const {
#ifdef EIGEN_GPUCC
    gpuError_t error = gpuStreamQuery(stream_->stream());
    return (error == gpuSuccess) || (error == gpuErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
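
// Illustrative sketch (not part of the original header): typical host-side
// usage pairs a GpuStreamDevice with a GpuDevice and evaluates tensor
// expressions on it via .device(). The sizes and variable names below are
// arbitrary example values.
//
//   Eigen::GpuStreamDevice stream;           // default stream, current device
//   Eigen::GpuDevice gpu_device(&stream);
//
//   const int n = 1024;
//   float* d_a = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));
//   float* d_b = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));
//   float* d_c = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));
//
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > a(d_a, n), b(d_b, n), c(d_c, n);
//   c.device(gpu_device) = a + b;             // enqueued on the GPU stream
//   gpu_device.synchronize();
//
//   gpu_device.deallocate(d_a);
//   gpu_device.deallocate(d_b);
//   gpu_device.deallocate(d_c);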

#if defined(EIGEN_HIPCC)

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
  gpu_assert(hipGetLastError() == hipSuccess);

#else

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
  gpu_assert(cudaGetLastError() == cudaSuccess);

#endif
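
// Illustrative sketch (not part of the original header): the macro expands to
// a <<<...>>> launch under nvcc and to hipLaunchKernelGGL under hipcc, and
// checks the launch status afterwards. The my_kernel function below is a
// hypothetical __global__ kernel, not something defined by Eigen; gpu_device
// is a GpuDevice as constructed in the example above.
//
//   __global__ void my_kernel(const float* in, float* out, int n);
//   ...
//   const int block_size = 256;
//   const int num_blocks = (n + block_size - 1) / block_size;
//   LAUNCH_GPU_KERNEL(my_kernel, num_blocks, block_size, 0, gpu_device,
//                     d_in, d_out, n);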

// FIXME: Should be device and kernel specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  gpu_assert(status == gpuSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif

} // end namespace Eigen

// undefine all the gpu* macros we defined at the beginning of the file
#include "TensorGpuHipCudaUndefines.h"

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H