/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_UTIL_GPU_KERNEL_HELPER_H_
#define TENSORFLOW_CORE_UTIL_GPU_KERNEL_HELPER_H_

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include <type_traits>

#if GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cuda_fp16.h"
#endif
#include "tensorflow/core/util/gpu_cuda_alias.h"
#include "tensorflow/core/util/gpu_device_functions.h"
#include "tensorflow/core/util/gpu_launch_config.h"

#if GOOGLE_CUDA
#define TF_RED_WARPSIZE 32
#elif TENSORFLOW_USE_ROCM
#define TF_RED_WARPSIZE 64
#endif

// Deprecated, use 'for(int i : GpuGridRangeX(n))' instead.
#define GPU_1D_KERNEL_LOOP(i, n) \
  for (int i : ::tensorflow::GpuGridRangeX<int>(n))
#define CUDA_1D_KERNEL_LOOP(i, n) \
  for (int i : ::tensorflow::GpuGridRangeX<int>(n))

// Deprecated, use 'for(int i : GpuGridRange{X,Y,Z}(n))' instead.
#define GPU_AXIS_KERNEL_LOOP(i, n, axis) \
  for (int i : ::tensorflow::GpuGridRange##axis<int>(n))
#define CUDA_AXIS_KERNEL_LOOP(i, n, axis) \
  for (int i : ::tensorflow::GpuGridRange##axis<int>(n))

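// Example (illustrative sketch, not part of the original header): an
// element-wise kernel written with the non-deprecated GpuGridRangeX form that
// the macros above expand to. The kernel name and parameters are hypothetical.
//
//   __global__ void ScaleKernel(int n, const float* in, float scale,
//                               float* out) {
//     for (int i : ::tensorflow::GpuGridRangeX<int>(n)) {
//       out[i] = in[i] * scale;
//     }
//   }
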
#if GOOGLE_CUDA
#define gpuSuccess cudaSuccess
using gpuStream_t = cudaStream_t;
using gpuError_t = cudaError_t;
#elif TENSORFLOW_USE_ROCM
#define gpuSuccess hipSuccess
using gpuStream_t = hipStream_t;
using gpuError_t = hipError_t;
#endif

// Macro wrapper to declare dynamic shared memory.
#if GOOGLE_CUDA

#define GPU_DYNAMIC_SHARED_MEM_DECL(ALIGN, TYPE, NAME) \
  extern __shared__ __align__(ALIGN) TYPE NAME[]

#elif TENSORFLOW_USE_ROCM

#define GPU_DYNAMIC_SHARED_MEM_DECL(ALIGN, TYPE, NAME) \
  HIP_DYNAMIC_SHARED(TYPE, NAME)

#endif

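// Example (illustrative sketch, not part of the original header): declaring
// dynamic shared memory inside a kernel via the wrapper above. The kernel is
// hypothetical and must be launched with blockDim.x * sizeof(float) bytes of
// dynamic shared memory.
//
//   __global__ void StagedCopyKernel(const float* in, float* out, int n) {
//     GPU_DYNAMIC_SHARED_MEM_DECL(4, float, shared_buf);
//     const int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) shared_buf[threadIdx.x] = in[i];
//     __syncthreads();
//     if (i < n) out[i] = shared_buf[threadIdx.x];
//   }
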
namespace tensorflow {

#if GOOGLE_CUDA
// cudaGetErrorString is available to both host and device.
__host__ __device__ inline const char* GpuGetErrorString(cudaError_t error) {
  return cudaGetErrorString(error);
}
#elif TENSORFLOW_USE_ROCM
// hipGetErrorString is available on the host side only.
inline const char* GpuGetErrorString(hipError_t error) {
  return hipGetErrorString(error);
}
#endif

// Returns a raw reference to the current CUDA stream. Required by a
// number of kernel calls (for which StreamInterface* does not work),
// e.g. CUB and certain cuBLAS primitives.
inline const gpuStream_t& GetGpuStream(OpKernelContext* context) {
  const gpuStream_t* ptr = CHECK_NOTNULL(
      reinterpret_cast<const gpuStream_t*>(context->op_device_context()
                                               ->stream()
                                               ->implementation()
                                               ->GpuStreamMemberHack()));
  return *ptr;
}

// Launches a GPU kernel through cudaLaunchKernel in the CUDA environment, or
// hipLaunchKernelGGL in the ROCm environment, with the given arguments.
//
// The kernel parameters 'Ts' must be constructible from the arguments 'Args'.
template <typename... Ts, typename... Args>
Status GpuLaunchKernel(void (*function)(Ts...), dim3 grid_dim, dim3 block_dim,
                       size_t shared_memory_size_bytes, gpuStream_t stream,
                       Args... arguments) {
  static_assert(detail::NoneIsReference<Ts...>(),
                "Kernels with reference arguments have undefined behaviour.");
#if GOOGLE_CUDA
  auto func_ptr = absl::bit_cast<const void*>(function);
  // Cast arguments and forward them as an array of pointers.
  auto args_tuple = std::tuple<Ts...>(arguments...);
  auto arg_ptrs = detail::GetArrayOfElementPointers(&args_tuple);
  auto result = cudaLaunchKernel(func_ptr, grid_dim, block_dim, arg_ptrs.data(),
                                 shared_memory_size_bytes, stream);
  if (result != cudaSuccess) {
    return errors::Internal(cudaGetErrorString(result));
  }
#elif TENSORFLOW_USE_ROCM
  hipLaunchKernelGGL(function, grid_dim, block_dim, shared_memory_size_bytes,
                     stream, std::forward<Args>(arguments)...);
#endif
  return Status::OK();
}

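// Example (illustrative sketch, not part of the original header): launching a
// kernel from an op's Compute() using GetGpuStream and GpuLaunchKernel. The
// kernel ScaleKernel, the variables n/in_ptr/out_ptr, and the Eigen GPU
// device 'd' are hypothetical.
//
//   gpuStream_t stream = GetGpuStream(context);
//   GpuLaunchConfig config = GetGpuLaunchConfig(n, d);
//   TF_CHECK_OK(GpuLaunchKernel(ScaleKernel, config.block_count,
//                               config.thread_per_block,
//                               /*shared_memory_size_bytes=*/0, stream, n,
//                               in_ptr, 2.0f, out_ptr));
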
// Perfect forwarding to make CudaLaunchKernel available to both ROCm and CUDA
// builds
template <typename... Args>
auto CudaLaunchKernel(Args&&... args)
    -> decltype(GpuLaunchKernel(std::forward<Args>(args)...)) {
  return GpuLaunchKernel(std::forward<Args>(args)...);
}

__host__ __device__ inline tensorflow::bfloat16 GpuLdg(
    const tensorflow::bfloat16* address) {
  return Eigen::numext::bit_cast<tensorflow::bfloat16>(
      GpuLdg(reinterpret_cast<const uint16_t*>(address)));
}
// Already aliased in gpu_device_functions.h

template <typename T>
__host__ __device__ inline T ldg(const T* ptr) {
  return GpuLdg(ptr);
}

template <typename T>
__host__ __device__ inline const T& tf_min(const T& x, const T& y) {
  return x < y ? x : y;
}

template <typename T>
__host__ __device__ inline const T& tf_max(const T& x, const T& y) {
  return x < y ? y : x;
}

// Overloads of the above functions for float and double.
__host__ __device__ inline float tf_min(float x, float y) {
  return fminf(x, y);
}
__host__ __device__ inline double tf_min(double x, double y) {
  return fmin(x, y);
}
__host__ __device__ inline float tf_max(float x, float y) {
  return fmaxf(x, y);
}
__host__ __device__ inline double tf_max(double x, double y) {
  return fmax(x, y);
}

// ROCm TODO: re-enable these after adding fp16 support logic.
#if GOOGLE_CUDA
__device__ inline Eigen::half GpuShuffleSync(unsigned mask, Eigen::half value,
                                             int src_lane,
                                             int width = warpSize) {
  return Eigen::half(
      GpuShuffleSync(mask, static_cast<uint16>(value), src_lane, width));
}
// Aliased in gpu_device_functions.h

__device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleUpSync(
    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
  return Eigen::half(
      GpuShuffleUpSync(mask, static_cast<uint16>(value), delta, width));
}
// Aliased in gpu_device_functions.h

__device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleDownSync(
    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
  return Eigen::half(
      GpuShuffleDownSync(mask, static_cast<uint16>(value), delta, width));
}
// Aliased in gpu_device_functions.h

__device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleXorSync(
    unsigned mask, Eigen::half value, int lane_mask, int width = warpSize) {
  return Eigen::half(
      GpuShuffleXorSync(mask, static_cast<uint16>(value), lane_mask, width));
}
// Aliased in gpu_device_functions.h
#endif

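// Example (illustrative sketch, not part of the original header): a warp-wide
// sum reduction over Eigen::half built on the GpuShuffleDownSync overload
// above (CUDA-only, like the block it follows). The helper name is
// hypothetical.
//
//   __device__ inline Eigen::half WarpSumHalf(Eigen::half value) {
//     const unsigned kFullMask = 0xffffffffu;
//     for (int offset = TF_RED_WARPSIZE / 2; offset > 0; offset /= 2) {
//       value = value + GpuShuffleDownSync(kFullMask, value, offset);
//     }
//     return value;  // lane 0 now holds the sum across the warp
//   }
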
#ifdef __CUDA_ARCH__
#define UNROLL_ON_DEVICE _Pragma("unroll")
#else
#define UNROLL_ON_DEVICE
#endif

// Represents an aligned array of N elements of T. Data pointers can be
// reinterpreted as this type to generate vectorized loads/stores in a kernel.
template <typename T, int N>
class alignas(alignof(T) * N) AlignedVector {
 public:
  typedef T value_type;
  static constexpr const int kSize = N;

  AlignedVector() = default;

  // Uniform initialization.
  __host__ __device__ explicit AlignedVector(value_type uniform) {
    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) { values_[i] = uniform; }
  }
  // Uniform initialization with explicit conversion.
  // Note: This is required for T=Eigen::half because it only supports explicit
  // conversions from other types and its template constructor is too relaxed
  // to be able to use std::is_constructible.
  template <typename U, typename std::enable_if<std::is_arithmetic<U>::value,
                                                int>::type = 0>
  __host__ __device__ explicit AlignedVector(U uniform_u) {
    value_type uniform(uniform_u);
    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) { values_[i] = uniform; }
  }
  // Implicit conversion.
  template <typename U, typename std::enable_if<
                            std::is_convertible<U, T>::value, int>::type = 0>
  __host__ __device__ AlignedVector(const AlignedVector<U, N>& other) {
    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) { values_[i] = other[i]; }
  }
  // Explicit conversion.
  template <typename U,
            typename std::enable_if<!std::is_convertible<U, T>::value &&
                                        std::is_constructible<T, U>::value,
                                    int>::type = 0>
  __host__ __device__ explicit AlignedVector(const AlignedVector<U, N>& other) {
    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) {
      values_[i] = T(other[i]);
    }
  }

  __host__ __device__ value_type& operator[](int i) { return values_[i]; }
  __host__ __device__ const value_type& operator[](int i) const {
    return values_[i];
  }

#define DEFINE_BINARY_UPDATE_OPERATOR(op)                                      \
  __host__ __device__ AlignedVector& operator op(const AlignedVector& rhs) {   \
    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) { values_[i] op rhs[i]; } \
    return *this;                                                              \
  }
  DEFINE_BINARY_UPDATE_OPERATOR(+=)
  DEFINE_BINARY_UPDATE_OPERATOR(-=)
  DEFINE_BINARY_UPDATE_OPERATOR(*=)
  DEFINE_BINARY_UPDATE_OPERATOR(/=)
#undef DEFINE_BINARY_UPDATE_OPERATOR

#define DEFINE_BINARY_OPERATOR(op)                          \
  friend __host__ __device__ AlignedVector operator op(     \
      const AlignedVector& lhs, const AlignedVector& rhs) { \
    AlignedVector ret;                                      \
    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) {      \
      ret[i] = lhs[i] op rhs[i];                            \
    }                                                       \
    return ret;                                             \
  }
  DEFINE_BINARY_OPERATOR(+)
  DEFINE_BINARY_OPERATOR(-)
  DEFINE_BINARY_OPERATOR(*)
  DEFINE_BINARY_OPERATOR(/)
#undef DEFINE_BINARY_OPERATOR

 private:
  value_type values_[N];
};

#undef UNROLL_ON_DEVICE

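// Example (illustrative sketch, not part of the original header): using
// AlignedVector to emit vectorized loads/stores. The kernel is hypothetical;
// the caller must guarantee that 'in' and 'out' are aligned to kVecSize
// elements and that the element count is a multiple of kVecSize.
//
//   template <int kVecSize>
//   __global__ void VecScaleKernel(int64_t n_vec, const float* in,
//                                  float scale, float* out) {
//     using Vec = AlignedVector<float, kVecSize>;
//     const Vec* vec_in = reinterpret_cast<const Vec*>(in);
//     Vec* vec_out = reinterpret_cast<Vec*>(out);
//     for (int64_t i : GpuGridRangeX<int64_t>(n_vec)) {
//       vec_out[i] = vec_in[i] * Vec(scale);
//     }
//   }
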
// Returns the maximum power-of-two alignment (in units of elements, not bytes)
// of a stride or pointer value.
inline int64 alignment_of(int64_t element_stride) {
  return element_stride & -element_stride;
}

template <typename T>
inline int64 alignment_of(T* ptr) {
  const intptr_t ptr_val = reinterpret_cast<std::uintptr_t>(ptr);
  // Pointers should always be aligned to sizeof(T) bytes.
  DCHECK_EQ(ptr_val % sizeof(T), 0);
  // Note that we want the alignment in elements, not bytes.
  return alignment_of(ptr_val / sizeof(T));
}

template <typename... Args>
int64 MinAlignmentOf(Args... args) {
  return std::min({alignment_of(args)...});
}

// Calls Functor<vec_size>()(args...) with vec_size set to the optimal GPU
// vector instruction size for type T that is <= max_vec_size. The max_vec_size
// argument should be set to the minimum alignment of all relevant parameters.
template <typename T, template <int vec_size> class Functor, typename... Args>
Status DispatchToVectorized(int64_t max_vec_size, Args&&... args) {
  constexpr const int kOptimalVecSizeBytes = 16;
  // The optimal number of (aligned) elements of T to load/store in a
  // single instruction inside a kernel.
  constexpr const int optimal_vec_size =
      (kOptimalVecSizeBytes - 1) / sizeof(T) + 1;
  int64_t vec_size = std::min((int64)optimal_vec_size, max_vec_size);
  if (vec_size >= 16) {
    return Functor<16>()(std::forward<Args>(args)...);
  } else if (vec_size >= 8) {
    return Functor<8>()(std::forward<Args>(args)...);
  } else if (vec_size >= 4) {
    return Functor<4>()(std::forward<Args>(args)...);
  } else if (vec_size >= 2) {
    return Functor<2>()(std::forward<Args>(args)...);
  } else {
    return Functor<1>()(std::forward<Args>(args)...);
  }
}

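// Example (illustrative sketch, not part of the original header): combining
// MinAlignmentOf with DispatchToVectorized. The functor, the omitted kernel
// launch, and the variable names are hypothetical.
//
//   template <int vec_size>
//   struct LaunchScaleFunctor {
//     Status operator()(int64_t n, const float* in, float scale, float* out) {
//       // Launch a kernel specialized on vec_size here (omitted).
//       return Status::OK();
//     }
//   };
//
//   int64_t max_vec_size = MinAlignmentOf(n, in, out);
//   TF_RETURN_IF_ERROR((DispatchToVectorized<float, LaunchScaleFunctor>(
//       max_vec_size, n, in, scale, out)));
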
namespace gpu_helper {
template <typename T, typename OutType = int32>
__device__ OutType upper_bound(const T* first, OutType count, T val) {
  const T* orig = first;
  const T* it = nullptr;
  OutType step = 0;
  while (count > 0) {
    it = first;
    step = count / 2;
    it += step;
    if (!(val < *it)) {
      first = ++it;
      count -= step + 1;
    } else {
      count = step;
    }
  }

  return first - orig;
}

template <typename T, typename OutType = int32>
__device__ OutType lower_bound(const T* first, OutType count, T val) {
  const T* orig = first;
  const T* it = nullptr;
  OutType step = 0;
  while (count > 0) {
    it = first;
    step = count / 2;
    it += step;
    if (*it < val) {
      first = ++it;
      count -= step + 1;
    } else {
      count = step;
    }
  }

  return first - orig;
}

}  // namespace gpu_helper

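// Example (illustrative sketch, not part of the original header): using
// gpu_helper::lower_bound for a per-element binary search over sorted bucket
// boundaries. The kernel is hypothetical.
//
//   __global__ void BucketizeKernel(const float* values, int n,
//                                   const float* boundaries,
//                                   int32 num_boundaries, int32* bucket_ids) {
//     for (int i : GpuGridRangeX<int>(n)) {
//       bucket_ids[i] = gpu_helper::lower_bound<float>(
//           boundaries, num_boundaries, values[i]);
//     }
//   }
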
#ifndef TENSORFLOW_USE_ROCM
namespace cuda_helper = gpu_helper;
#endif

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#endif  // TENSORFLOW_CORE_UTIL_GPU_KERNEL_HELPER_H_