// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H

// This header defines the gpu* macros, which resolve to their hip* or
// cuda* equivalents depending on the compiler in use.
// A separate header (included at the end of this file) undefines them all.
#include "TensorGpuHipCudaDefines.h"

namespace Eigen {

static const int kGpuScratchSize = 1024;

// This defines an interface that GPUDevice can take to use
// HIP / CUDA streams underneath.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const gpuStream_t& stream() const = 0;
  virtual const gpuDeviceProp_t& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kGpuScratchSize bytes (1 KiB)
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initialized to 0, and each
  // kernel using it is responsible for resetting it to 0 upon completion,
  // maintaining the invariant that the semaphore always equals 0 when a
  // kernel starts.
  virtual unsigned int* semaphore() const = 0;
};
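
// A minimal sketch of the semaphore protocol described above, written as a
// hypothetical kernel (ExampleKernel is not part of Eigen; it only
// illustrates the reset-to-0 invariant):
//
//   __global__ void ExampleKernel(unsigned int* semaphore) {
//     // ... per-thread work coordinated through *semaphore ...
//     const unsigned int total = gridDim.x * blockDim.x;
//     // atomicInc wraps the counter back to 0 once it reaches total - 1,
//     // so the last thread to arrive automatically restores the invariant
//     // for the next launch.
//     atomicInc(semaphore, total - 1);
//   }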

class GpuDeviceProperties {
 public:
  GpuDeviceProperties() :
      initialized_(false), first_(true), device_properties_(nullptr) {}

  ~GpuDeviceProperties() {
    if (device_properties_) {
      delete[] device_properties_;
    }
  }

  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
    return device_properties_[device];
  }

  EIGEN_STRONG_INLINE bool isInitialized() const {
    return initialized_;
  }

  void initialize() {
    if (!initialized_) {
      // Attempts to ensure proper behavior when multiple threads call this
      // function simultaneously. This would be trivial to implement with
      // std::mutex, but unfortunately mutexes don't compile under nvcc, so
      // we resort to atomics and thread fences instead. Note that if the
      // caller uses a compiler without C++11 support, we can't guarantee
      // that the initialization is thread-safe.
      if (first_.exchange(false)) {
        // We're the first thread to reach this point.
        int num_devices;
        gpuError_t status = gpuGetDeviceCount(&num_devices);
        if (status != gpuSuccess) {
          std::cerr << "Failed to get the number of GPU devices: "
                    << gpuGetErrorString(status)
                    << std::endl;
          gpu_assert(status == gpuSuccess);
        }
        device_properties_ = new gpuDeviceProp_t[num_devices];
        for (int i = 0; i < num_devices; ++i) {
          status = gpuGetDeviceProperties(&device_properties_[i], i);
          if (status != gpuSuccess) {
            std::cerr << "Failed to initialize GPU device #"
                      << i
                      << ": "
                      << gpuGetErrorString(status)
                      << std::endl;
            gpu_assert(status == gpuSuccess);
          }
        }

        std::atomic_thread_fence(std::memory_order_release);
        initialized_ = true;
      } else {
        // Wait for the other thread to initialize the properties.
        while (!initialized_) {
          std::atomic_thread_fence(std::memory_order_acquire);
          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        }
      }
    }
  }

 private:
  volatile bool initialized_;
  std::atomic<bool> first_;
  gpuDeviceProp_t* device_properties_;
};

EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
  if (!deviceProperties->isInitialized()) {
    deviceProperties->initialize();
  }
  return *deviceProperties;
}

EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
  return GetGpuDeviceProperties().get(device);
}
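
// Illustrative usage sketch (assumes a working GPU runtime and that device 0
// exists); the first call triggers the lazy, thread-safe initialization above:
//
//   const gpuDeviceProp_t& prop = Eigen::GetGpuDeviceProperties(0);
//   int sm_count = prop.multiProcessorCount;
//   int max_threads = prop.maxThreadsPerBlock;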

static const gpuStream_t default_stream = gpuStreamDefault;

class GpuStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device
  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    gpuGetDevice(&device_);
  }
  // Use the default stream on the specified device
  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
  // Use the specified stream. Note that it's the caller's
  // responsibility to ensure that the stream can run on the
  // specified device. If no device is specified, the code
  // assumes that the stream is associated with the current GPU device.
  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      gpuGetDevice(&device_);
    } else {
      int num_devices;
      gpuError_t err = gpuGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
      gpu_assert(device < num_devices);
      device_ = device;
    }
  }

  virtual ~GpuStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const gpuStream_t& stream() const { return *stream_; }
  const gpuDeviceProp_t& deviceProperties() const {
    return GetGpuDeviceProperties(device_);
  }
  virtual void* allocate(size_t num_bytes) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    void* result;
    err = gpuMalloc(&result, num_bytes);
    gpu_assert(err == gpuSuccess);
    gpu_assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    gpu_assert(buffer != NULL);
    err = gpuFree(buffer);
    gpu_assert(err == gpuSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
    }
    return semaphore_;
  }

 private:
  const gpuStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
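
// A minimal usage sketch (hypothetical values; `my_stream` stands for a
// stream created by the caller through the CUDA/HIP runtime). The three
// constructors cover the common cases:
//
//   Eigen::GpuStreamDevice dev0;                 // default stream, current device
//   Eigen::GpuStreamDevice dev1(1);              // default stream, device 1
//   Eigen::GpuStreamDevice dev2(&my_stream, 0);  // caller-owned stream on device 0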

struct GpuDevice {
  // The StreamInterface is not owned: the caller is
  // responsible for its initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  // TODO(bsteiner): This is an internal API, we should not expose it.
  EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    stream_->deallocate(buffer);
  }

  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
    stream_->deallocate(buffer);
  }

  template<typename Type>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
  }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
    return stream_->semaphore();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
                                    stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(dst);
    EIGEN_UNUSED_VARIABLE(src);
    EIGEN_UNUSED_VARIABLE(n);
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    gpuError_t err =
        gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    gpuError_t err =
        gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the l2 cache for the time being, and
    // there is no l3 cache on hip/cuda devices.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuStreamSynchronize(stream_->stream());
    if (err != gpuSuccess) {
      std::cerr << "Error detected in GPU stream: "
                << gpuGetErrorString(err)
                << std::endl;
      gpu_assert(err == gpuSuccess);
    }
#else
    gpu_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return stream_->deviceProperties().sharedMemPerBlock;
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
  }

  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  // This function checks if the GPU runtime recorded an error for the
  // underlying stream device.
  inline bool ok() const {
#ifdef EIGEN_GPUCC
    gpuError_t error = gpuStreamQuery(stream_->stream());
    return (error == gpuSuccess) || (error == gpuErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
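
// A minimal end-to-end sketch (host-side values are hypothetical; assumes the
// GPU runtime is available). It wires a GpuStreamDevice into a GpuDevice,
// round-trips a buffer through device memory, and synchronizes the stream:
//
//   Eigen::GpuStreamDevice stream_device;     // default stream, current device
//   Eigen::GpuDevice device(&stream_device);  // non-owning: stream_device must outlive device
//
//   const size_t bytes = 256 * sizeof(float);
//   float host_in[256] = {};
//   float host_out[256];
//   void* dev_buf = device.allocate(bytes);
//   device.memcpyHostToDevice(dev_buf, host_in, bytes);
//   device.memcpyDeviceToHost(host_out, dev_buf, bytes);
//   device.synchronize();                     // the copies above are async
//   device.deallocate(dev_buf);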

#if defined(EIGEN_HIPCC)

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
  gpu_assert(hipGetLastError() == hipSuccess);

#else

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
  gpu_assert(cudaGetLastError() == cudaSuccess);

#endif
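
// Launch sketch for the macro above (MyKernel and its arguments are
// hypothetical; `device` is a GpuDevice as constructed earlier). The macro
// expands to either a hipLaunchKernelGGL call or a <<<...>>> launch on the
// device's stream, followed by an error check:
//
//   LAUNCH_GPU_KERNEL(MyKernel, /*gridsize=*/32, /*blocksize=*/256,
//                     /*sharedmem=*/0, device, arg0, arg1);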

// FIXME: Should be device and kernel specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  gpu_assert(status == gpuSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif
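
// Usage sketch (host-only path; cudaSharedMemBankSizeEightByte is the CUDA
// spelling, hipSharedMemBankSizeEightByte the HIP analogue; see
// TensorGpuHipCudaDefines.h for the gpu* mapping):
//
//   Eigen::setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);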

}  // end namespace Eigen

// undefine all the gpu* macros we defined at the beginning of the file
#include "TensorGpuHipCudaUndefines.h"

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H