/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_
#define TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_

/**
 * Wrappers and helpers for CUDA device code.
 *
 * Wraps the warp-cooperative intrinsics introduced in CUDA 9 to provide
 * backwards compatibility, see go/volta-porting for details.
 * Provides atomic operations on types that aren't natively supported.
 */

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include <algorithm>
#include <complex>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#if GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cuComplex.h"
#include "third_party/gpus/cuda/include/cuda.h"
#endif
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/gpu_cuda_alias.h"

namespace tensorflow {

// According to the HIP developer guide at
// https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md#assert
// assert is not supported by HIP. While we are waiting for assert support in
// HIP kernels, the assert call is defined as a no-op so that it does not
// block us from creating a debug build.
#if TENSORFLOW_USE_ROCM
#undef assert
#define assert(x) \
  {}
#endif

namespace detail {

// Helper for range-based for loop using 'delta' increments.
// Usage: see GpuGridRange?() functions below.
template <typename T>
class GpuGridRange {
  struct Iterator {
    __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
    __device__ T operator*() const { return index_; }
    __device__ Iterator& operator++() {
      index_ += delta_;
      return *this;
    }
    __device__ bool operator!=(const Iterator& other) const {
      bool greater = index_ > other.index_;
      bool less = index_ < other.index_;
      // Anything past an end iterator (delta_ == 0) is equal.
      // In range-based for loops, this optimizes to 'return less'.
      if (!other.delta_) {
        return less;
      }
      if (!delta_) {
        return greater;
      }
      return less || greater;
    }

   private:
    T index_;
    const T delta_;
  };

 public:
  __device__ GpuGridRange(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

  __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
  __device__ Iterator end() const { return Iterator{end_, 0}; }

 private:
  T begin_;
  T delta_;
  T end_;
};

#ifndef TENSORFLOW_USE_ROCM
template <typename... T>
using CudaGridRange = GpuGridRange<T...>;
#endif
}  // namespace detail

// Helper to visit indices in the range 0 <= i < count, using the x-coordinate
// of the global thread index. That is, each index i is visited by all threads
// with the same x-coordinate.
// Usage: for(int i : GpuGridRangeX(count)) { visit(i); }
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeX(T count) {
  return detail::GpuGridRange<T>(blockIdx.x * blockDim.x + threadIdx.x,
                                 gridDim.x * blockDim.x, count);
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeX, CudaGridRangeX);

// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : GpuGridRangeY(count)) { visit(i); }
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeY(T count) {
  return detail::GpuGridRange<T>(blockIdx.y * blockDim.y + threadIdx.y,
                                 gridDim.y * blockDim.y, count);
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeY, CudaGridRangeY);

// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : GpuGridRangeZ(count)) { visit(i); }
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeZ(T count) {
  return detail::GpuGridRange<T>(blockIdx.z * blockDim.z + threadIdx.z,
                                 gridDim.z * blockDim.z, count);
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeZ, CudaGridRangeZ);
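
// Example (sketch, not part of the original header): a simple element-wise
// kernel written against GpuGridRangeX. The kernel name and the 'scale'
// parameter are illustrative assumptions; any 1-D grid launch works, since
// the range object folds the usual grid-stride loop into range-based for.
//
//   template <typename T>
//   __global__ void ScaleKernel(int count, const T* in, T* out, T scale) {
//     for (int i : GpuGridRangeX(count)) {
//       out[i] = in[i] * scale;  // each index in [0, count) is visited once
//     }
//   }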

// Mask for all 32 threads in a warp.
__device__ const unsigned kCudaWarpAll = 0xffffffff;
// ROCM TODO add ROCM implementation
// Mask for all 64 threads in a wavefront.
__device__ const unsigned kGpuWarpAll = 0xffffffff;

// Returns the warp lane ID of the calling thread
__device__ inline unsigned GpuLaneId() {
  unsigned int lane_id;
#if GOOGLE_CUDA
#if __clang__
  return __nvvm_read_ptx_sreg_laneid();
#else   // __clang__
  asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
#endif  // __clang__
#elif TENSORFLOW_USE_ROCM
  lane_id = __lane_id();
#endif
  return lane_id;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuLaneId, CudaLaneId);

namespace detail {
// Returns true if mask is a valid parameter for __shfl*sync to return a well
// defined value, assuming the calling lane will read from src_lane as part of
// the shuffle operation.
//
// Specifically, returns true iff mask has the calling lane bit and the src_lane
// bit set, and the src_lane calls this function with the same mask value
// (required for the two threads to wait for each other).
//
// On Volta, for some invalid masks, this function hangs or returns false
// positives, because the implementation shuffles with the same mask that
// we are validating. Run on Pascal if you suspect that the mask is incorrect.
__device__ inline bool GpuValidateShuffleSyncMask(unsigned mask,
                                                  unsigned src_lane) {
  unsigned src_dst_mask = 1u << GpuLaneId() | 1u << src_lane;
#if CUDA_VERSION >= 9000
  unsigned src_lane_mask = __shfl_sync(mask, mask, src_lane);
#else
#if GOOGLE_CUDA
  unsigned src_lane_mask = __shfl(mask, src_lane);
#elif TENSORFLOW_USE_ROCM
  unsigned src_lane_mask =
      __shfl(static_cast<int>(mask), static_cast<int>(src_lane));
#endif
#endif
  return (src_dst_mask & ~mask) == 0 && src_lane_mask == mask;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuValidateShuffleSyncMask,
                                  CudaValidateShuffleSyncMask);

// Returns the actual source lane for shuffle.
__device__ inline unsigned GpuShuffleGetSrcLane(int src_lane, int width) {
  int lane_id = GpuLaneId();
  int lane_base = lane_id & ~width + 1;
  int lane_offset = src_lane & width - 1;
  return lane_base + lane_offset;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleGetSrcLane, CudaShuffleGetSrcLane);

// Returns the source lane for shuffle up.
__device__ inline unsigned GpuShuffleUpGetSrcLane(unsigned delta, int width) {
  unsigned lane_id = GpuLaneId();
  if ((lane_id & width - 1) < delta) {
    return lane_id;
  }
  return lane_id - delta;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpGetSrcLane,
                                  CudaShuffleUpGetSrcLane);

// Returns the source lane for shuffle down.
__device__ inline unsigned GpuShuffleDownGetSrcLane(unsigned delta, int width) {
  unsigned lane_id = GpuLaneId();
  if ((lane_id & width - 1) + delta >= width) {
    return lane_id;
  }
  return lane_id + delta;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleDownGetSrcLane,
                                  CudaShuffleDownGetSrcLane);

// Returns the source lane for shuffle xor.
__device__ inline unsigned GpuShuffleXorGetSrcLane(int lane_mask, int width) {
  int lane_id = GpuLaneId();
  int src_lane = lane_id ^ lane_mask;
  if (src_lane > (lane_id | width - 1)) {
    return lane_id;
  }
  return src_lane;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleXorGetSrcLane,
                                  CudaShuffleXorGetSrcLane);
}  // namespace detail

// For all *_sync wrappers below, it is illegal to synchronize threads from
// different program locations, because that is not supported before sm_70.
// In other words, all threads in 'mask' must call the functions in convergence.
// Code that requires sm_70 (and CUDA 9) may use the intrinsic directly.
//
// It is also illegal to shuffle with a mask that produces an undefined result
// for any of the threads. Specifically, all source threads of the shuffle
// must have their corresponding bit in 'mask' set.

// Wrapper for __syncwarp. No-op for CUDA 8 and earlier.
__device__ inline void GpuSyncWarp(unsigned mask = kCudaWarpAll) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  __syncwarp(mask);
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuSyncWarp, CudaSyncWarp);

// Wrapper for __ballot_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline unsigned GpuBallotSync(unsigned mask, int pred) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  return __ballot_sync(mask, pred);
#else
  return __ballot(pred) & mask;  // Apply mask to match __ballot_sync's spec.
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuBallotSync, CudaBallotSync);

// Wrapper for __any_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int GpuAnySync(unsigned mask, int pred) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  return __any_sync(mask, pred);
#else
  return __any(pred);
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAnySync, CudaAnySync);

// Wrapper for __all_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int GpuAllSync(unsigned mask, int pred) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  return __all_sync(mask, pred);
#else
  return __all(pred);
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAllSync, CudaAllSync);

// Wrapper for __shfl_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T GpuShuffleSync(unsigned mask, T value, int src_lane,
                            int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleGetSrcLane(src_lane, width)));
#if CUDA_VERSION >= 9000
  return __shfl_sync(mask, value, src_lane, width);
#else
  return __shfl(value, src_lane, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleSync(unsigned mask, double value,
                                        int src_lane, int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleSync(mask, hi, src_lane, width);
  lo = GpuShuffleSync(mask, lo, src_lane, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  auto tmp = static_cast<uint64_t>(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl(static_cast<int>(hi), src_lane, width);
  lo = __shfl(static_cast<int>(lo), src_lane, width);
  return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
                             static_cast<uint64_t>(lo));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleSync, CudaShuffleSync);
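
// Example (sketch, not part of the original header): broadcasting a value
// from lane 0 to every lane of a fully converged warp. Variable names are
// illustrative assumptions; the mask must cover all participating lanes.
//
//   unsigned mask = kCudaWarpAll;  // all 32 lanes participate
//   float warp_value = GpuShuffleSync(mask, thread_value, /*src_lane=*/0);
//   // Every lane in 'mask' now holds lane 0's thread_value.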

// Wrapper for __shfl_up_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ inline T GpuShuffleUpSync(unsigned mask, T value, unsigned delta,
                                     int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleUpGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_up_sync(mask, value, delta, width);
#else
  return __shfl_up(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleUpSync(unsigned mask, double value,
                                          unsigned delta,
                                          int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleUpSync(mask, hi, delta, width);
  lo = GpuShuffleUpSync(mask, lo, delta, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  auto tmp = static_cast<uint64_t>(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl_up(static_cast<int>(hi), delta, width);
  lo = __shfl_up(static_cast<int>(lo), delta, width);
  return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
                             static_cast<uint64_t>(lo));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpSync, CudaShuffleUpSync);

// Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
// in convergence, see comment above for details.
template <typename T>
__device__ inline T GpuShuffleDownSync(unsigned mask, T value, unsigned delta,
                                       int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleDownGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(mask, value, delta, width);
#else
  return __shfl_down(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleDownSync(unsigned mask, double value,
                                            unsigned delta,
                                            int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleDownSync(mask, hi, delta, width);
  lo = GpuShuffleDownSync(mask, lo, delta, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  auto tmp = static_cast<uint64_t>(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl_down(static_cast<int>(hi), delta, width);
  lo = __shfl_down(static_cast<int>(lo), delta, width);
  return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
                             static_cast<uint64_t>(lo));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleDownSync, CudaShuffleDownSync);
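
// Example (sketch, not part of the original header): a warp-level sum
// reduction built on GpuShuffleDownSync. The helper name WarpSum is an
// illustrative assumption; after the loop, lane 0 holds the warp total.
//
//   __device__ inline float WarpSum(float value) {
//     for (int delta = warpSize / 2; delta > 0; delta /= 2) {
//       value += GpuShuffleDownSync(kCudaWarpAll, value, delta);
//     }
//     return value;  // only meaningful on lane 0
//   }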

// Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T GpuShuffleXorSync(unsigned mask, T value, int lane_mask,
                               int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleXorGetSrcLane(lane_mask, width)));
#if GOOGLE_CUDA
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, lane_mask, width);
#else
  return __shfl_xor(value, lane_mask, width);
#endif
#elif TENSORFLOW_USE_ROCM
  // ROCM TODO: check if HIP should be changed to cope with more types
  return __shfl_xor(static_cast<int>(value), lane_mask, width);
#endif
}

#if TENSORFLOW_USE_ROCM
__device__ inline Eigen::half GpuShuffleXorSync(unsigned mask,
                                                Eigen::half value,
                                                int lane_mask,
                                                int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleXorGetSrcLane(lane_mask, width)));
  // TODO(rocm): This doesn't preserve NaN payload and flushes denorms to zero,
  // maybe this should be implemented differently?
  return static_cast<Eigen::half>(
      __shfl_xor(static_cast<float>(value), lane_mask, width));
}
#endif

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleXorSync(unsigned mask, double value,
                                           int lane_mask,
                                           int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleXorSync(mask, hi, lane_mask, width);
  lo = GpuShuffleXorSync(mask, lo, lane_mask, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  auto tmp = static_cast<uint64_t>(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl_xor(static_cast<int>(hi), lane_mask, width);
  lo = __shfl_xor(static_cast<int>(lo), lane_mask, width);
  return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
                             static_cast<uint64_t>(lo));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleXorSync, CudaShuffleXorSync);

// Wrapper for __ldg.
template <typename T>
__host__ __device__ T GpuLdg(const T* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}

__host__ __device__ inline bool GpuLdg(const bool* address) {
  return GpuLdg(reinterpret_cast<const char*>(address)) != 0;
}

__host__ __device__ inline std::complex<float> GpuLdg(
    const std::complex<float>* address) {
#if __CUDA_ARCH__ >= 350
  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
  return std::complex<float>(mem.x, mem.y);
#else
  return *address;
#endif
}

__host__ __device__ inline std::complex<double> GpuLdg(
    const std::complex<double>* address) {
#if __CUDA_ARCH__ >= 350
  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
  return std::complex<double>(mem.x, mem.y);
#else
  return *address;
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuLdg, CudaLdg);
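
// Example (sketch, not part of the original header): reading an input buffer
// through the read-only data cache inside a kernel. The kernel and argument
// names are illustrative assumptions.
//
//   template <typename T>
//   __global__ void CopyKernel(int count, const T* __restrict__ in, T* out) {
//     for (int i : GpuGridRangeX(count)) {
//       out[i] = GpuLdg(in + i);  // cached load on sm_35+, plain load otherwise
//     }
//   }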

// Zeroes count elements starting at ptr using all threads of a 1-D grid.
// Note: this function does not synchronize, and therefore the memory range is
// not guaranteed to be zero until the next kernel launch.
template <typename T>
__global__ void SetZero(const int count, T* __restrict__ ptr) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1);
  assert(blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : GpuGridRangeX(count)) {
    ptr[i] = T(0);
  }
}

// Helper to set all tensor entries to a specific value.
template <typename T>
__global__ void SetToValue(const int count, T* __restrict__ ptr, T value) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1);
  assert(blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : GpuGridRangeX(count)) {
    ptr[i] = value;
  }
}
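
// Example (sketch, not part of the original header): launching SetToValue on
// a float buffer with a hand-picked 1-D configuration. The block/grid sizes
// and the 'stream' and 'device_ptr' variables are illustrative assumptions;
// TensorFlow kernels normally derive the configuration from launch-config
// helpers instead of hard-coding it.
//
//   const int count = 1 << 20;
//   const int threads = 256;
//   const int blocks = (count + threads - 1) / threads;
//   SetToValue<float><<<blocks, threads, 0, stream>>>(count, device_ptr, 1.0f);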

namespace detail {
// Helper function for atomic accumulation implemented as CAS.
template <typename T, typename F>
__device__ T GpuAtomicCasHelper(T* ptr, F accumulate) {
  T old = *ptr;
  T assumed;
  do {
    assumed = old;
    old = atomicCAS(ptr, assumed, accumulate(assumed));
  } while (assumed != old);
  return old;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicCasHelper, CudaAtomicCasHelper);

// Overload for floating point (using integer comparison to handle NaN
// correctly).
template <typename F>
__device__ float GpuAtomicCasHelper(float* ptr, F accumulate) {
  return __int_as_float(
      GpuAtomicCasHelper(reinterpret_cast<int32*>(ptr), [accumulate](int32 a) {
        return __float_as_int(accumulate(__int_as_float(a)));
      }));
}
template <typename F>
__device__ double GpuAtomicCasHelper(double* ptr, F accumulate) {
#if TENSORFLOW_USE_ROCM
  // FIXME: remove the workaround below once the bug is fixed.
  // HIP has a bug in the implementation of __longlong_as_double,
  // so work around it by using reinterpret_cast<double*>.
  uint64_t result =
      GpuAtomicCasHelper(reinterpret_cast<tensorflow::uint64*>(ptr),
                         [accumulate](tensorflow::uint64 a) {
                           return __double_as_longlong(
                               accumulate(*(reinterpret_cast<double*>(&a))));
                         });
  return *(reinterpret_cast<double*>(&result));
#else
  return __longlong_as_double(GpuAtomicCasHelper(
      reinterpret_cast<tensorflow::uint64*>(ptr),
      [accumulate](tensorflow::uint64 a) {
        return __double_as_longlong(accumulate(__longlong_as_double(a)));
      }));
#endif
}

// Overload of the above function for half. Note that we don't have
// atomicCAS() for anything less than 32 bits, so we need to include the
// other 16 bits in the operation.
//
// This version is going to be very slow under high concurrency, since most
// threads will be spinning on failing their compare-and-swap tests. (The fact
// that we get false sharing on the neighboring fp16 makes this even worse.)
// If you are doing a large reduction, you are much better off doing the
// intermediate steps in fp32 and then switching to fp16 as late as you can
// in the calculations.
//
// Note: Assumes little endian.
template <typename F>
__device__ Eigen::half GpuAtomicCasHelper(Eigen::half* ptr, F accumulate) {
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
#endif
  namespace half_impl = Eigen::half_impl;
  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
  assert(!(intptr & 0x1));  // should be 2-aligned.
  if (intptr & 0x2) {
    // The half is in the second part of the uint32 (upper 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
    uint32 result = GpuAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short high = static_cast<unsigned short>(arg >> 16);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
      return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
  } else {
    // The half is in the first part of the uint32 (lower 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr);
    uint32 result = GpuAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
      return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
  }
}

template <typename From, typename To>
using ToTypeIfConvertible =
    typename std::enable_if<std::is_convertible<From, To>::value, To>::type;

}  // namespace detail

// CUDA provides atomic ops, but not for all types.  We provide wrappers
// for some ops and provide implementation for all reasonable types.

template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicAdd(T* ptr, U value) {
  return atomicAdd(ptr, value);
}

__device__ inline Eigen::half GpuAtomicAdd(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a + value; });
}

#if (__CUDA_ARCH__ < 600) || TENSORFLOW_USE_ROCM
__device__ inline double GpuAtomicAdd(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(ptr,
                                    [value](double a) { return a + value; });
}
#endif

// GpuAtomicAdd
// Specializations of GpuAtomicAdd for complex types, which GpuAtomicAdd does
// not support. We treat a std::complex<T>* as a T* (the C++ standard section
// 26.4.4 allows this explicitly) and atomic add the real and imaginary
// components individually. The operation as a whole is not atomic, but we can
// safely treat the components independently for the purpose of accumulating.

// ROCM TODO support GpuAtomicAdd for std::complex<>
#if GOOGLE_CUDA
__device__ inline std::complex<float> GpuAtomicAdd(std::complex<float>* ptr,
                                                   std::complex<float> value) {
  auto ptr_scalar = reinterpret_cast<float*>(ptr);
  return std::complex<float>(GpuAtomicAdd(ptr_scalar, value.real()),
                             GpuAtomicAdd(ptr_scalar + 1, value.imag()));
}

__device__ inline std::complex<double> GpuAtomicAdd(
    std::complex<double>* ptr, std::complex<double> value) {
  auto ptr_scalar = reinterpret_cast<double*>(ptr);
  return std::complex<double>(GpuAtomicAdd(ptr_scalar, value.real()),
                              GpuAtomicAdd(ptr_scalar + 1, value.imag()));
}
#endif
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicAdd, CudaAtomicAdd);
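
// Example (sketch, not part of the original header): accumulating per-thread
// contributions into a single output value. The kernel name and arguments
// are illustrative assumptions; GpuAtomicAdd resolves to the hardware
// atomicAdd where one exists and to the CAS-based helper otherwise.
//
//   template <typename T>
//   __global__ void AccumulateKernel(int count, const T* in, T* sum) {
//     for (int i : GpuGridRangeX(count)) {
//       GpuAtomicAdd(sum, in[i]);
//     }
//   }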

// GpuAtomicSub
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicSub(T* ptr, U value) {
  return atomicSub(ptr, value);
}

// Specializations of subtraction which add the negative value.
__device__ inline float GpuAtomicSub(float* ptr, float value) {
  return GpuAtomicAdd(ptr, -value);
}

__device__ inline double GpuAtomicSub(double* ptr, double value) {
  return GpuAtomicAdd(ptr, -value);
}

__device__ inline tensorflow::uint64 GpuAtomicSub(tensorflow::uint64* ptr,
                                                  tensorflow::uint64 value) {
  return GpuAtomicAdd(ptr, -value);
}

__device__ inline Eigen::half GpuAtomicSub(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a - value; });
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicSub, CudaAtomicSub);

// GpuAtomicMax
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMax(T* ptr, U value) {
  return atomicMax(ptr, value);
}

#if TENSORFLOW_USE_ROCM

/*
 * CUDA runtime headers have the following defined
 *   __device__  int max(int, int)
 *   __device__  float max(float, float)
 *   __device__  double max(double, double)
 *
 * and many others, whereas HIP runtime headers only have the "int" version.
 *
 * Therefore we need to special-case the ROCm version to call the correct
 * underlying routines for float and double types.
 */

__device__ inline float GpuAtomicMax(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](float a) { return fmaxf(a, value); });
}

__device__ inline double GpuAtomicMax(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return fmax(a, value); });
}

#else

__device__ inline float GpuAtomicMax(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(ptr,
                                    [value](float a) { return max(a, value); });
}

__device__ inline double GpuAtomicMax(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return max(a, value); });
}

#endif

__device__ inline Eigen::half GpuAtomicMax(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return max(a, value); });
}

#if __CUDA_ARCH__ < 320
__device__ inline tensorflow::uint64 GpuAtomicMax(tensorflow::uint64* ptr,
                                                  tensorflow::uint64 value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](tensorflow::uint64 a) { return max(a, value); });
}
#endif
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMax, CudaAtomicMax);

// GpuAtomicMin
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMin(T* ptr, U value) {
  return atomicMin(ptr, value);
}

#if TENSORFLOW_USE_ROCM

/*
 * CUDA runtime headers have the following defined
 *   __device__  int min(int, int)
 *   __device__  float min(float, float)
 *   __device__  double min(double, double)
 *
 * and many others, whereas HIP runtime headers only have the "int" version.
 *
 * Therefore we need to special-case the ROCm version to call the correct
 * underlying routines for float and double types.
 */

__device__ inline float GpuAtomicMin(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](float a) { return fminf(a, value); });
}

__device__ inline double GpuAtomicMin(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return fmin(a, value); });
}

#else

__device__ inline float GpuAtomicMin(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(ptr,
                                    [value](float a) { return min(a, value); });
}

__device__ inline double GpuAtomicMin(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return min(a, value); });
}

#endif

__device__ inline Eigen::half GpuAtomicMin(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return min(a, value); });
}

#if __CUDA_ARCH__ < 320
__device__ inline tensorflow::uint64 GpuAtomicMin(tensorflow::uint64* ptr,
                                                  tensorflow::uint64 value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](tensorflow::uint64 a) { return min(a, value); });
}
#endif
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMin, CudaAtomicMin);

// GpuAtomicMul
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMul(T* ptr, U value) {
  return detail::GpuAtomicCasHelper(ptr, [value](T a) { return a * value; });
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMul, CudaAtomicMul);

// GpuAtomicDiv
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicDiv(T* ptr, U value) {
  return detail::GpuAtomicCasHelper(ptr, [value](T a) { return a / value; });
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicDiv, CudaAtomicDiv);

// Operator overloads for complex numbers.
#if GOOGLE_CUDA
__device__ inline std::complex<float> operator+(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCaddf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<float> operator-(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCsubf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<float> operator*(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCmulf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<float> operator/(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCdivf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<double> operator+(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCadd(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}

__device__ inline std::complex<double> operator-(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCsub(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}

__device__ inline std::complex<double> operator*(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCmul(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}

__device__ inline std::complex<double> operator/(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCdiv(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}
#endif  // GOOGLE_CUDA

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#endif  // TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_