/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_
#define TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_

/**
 * Wrappers and helpers for CUDA device code.
 *
 * Wraps the warp-cooperative intrinsics introduced in CUDA 9 to provide
 * backwards compatibility, see go/volta-porting for details.
 * Provides atomic operations on types that aren't natively supported.
 */

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include <algorithm>
#include <complex>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#if GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cuComplex.h"
#include "third_party/gpus/cuda/include/cuda.h"
#endif
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/gpu_cuda_alias.h"

namespace tensorflow {
// According to the HIP developer guide at
// https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md#assert
// assert is not supported by HIP. While we are waiting for assert support in
// HIP kernels, the assert call is macro-expanded to a no-op so that it does
// not block us from creating a debug build.
#if TENSORFLOW_USE_ROCM
#undef assert
#define assert(x) \
  {}
#endif

namespace detail {

// Helper for range-based for loop using 'delta' increments.
// Usage: see GpuGridRange?() functions below.
template <typename T>
class GpuGridRange {
  struct Iterator {
    __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
    __device__ T operator*() const { return index_; }
    __device__ Iterator& operator++() {
      index_ += delta_;
      return *this;
    }
    __device__ bool operator!=(const Iterator& other) const {
      bool greater = index_ > other.index_;
      bool less = index_ < other.index_;
      // Anything past an end iterator (delta_ == 0) is equal.
      // In range-based for loops, this optimizes to 'return less'.
      if (!other.delta_) {
        return less;
      }
      if (!delta_) {
        return greater;
      }
      return less || greater;
    }

   private:
    T index_;
    const T delta_;
  };

 public:
  __device__ GpuGridRange(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

  __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
  __device__ Iterator end() const { return Iterator{end_, 0}; }

 private:
  T begin_;
  T delta_;
  T end_;
};

#ifndef TENSORFLOW_USE_ROCM
template <typename... T>
using CudaGridRange = GpuGridRange<T...>;
#endif
}  // namespace detail

// Helper to visit indices in the range 0 <= i < count, using the x-coordinate
// of the global thread index. That is, each index i is visited by all threads
// with the same x-coordinate.
// Usage: for(int i : GpuGridRangeX(count)) { visit(i); }
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeX(T count) {
  return detail::GpuGridRange<T>(blockIdx.x * blockDim.x + threadIdx.x,
                                 gridDim.x * blockDim.x, count);
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeX, CudaGridRangeX);

// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : GpuGridRangeY(count)) { visit(i); }
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeY(T count) {
  return detail::GpuGridRange<T>(blockIdx.y * blockDim.y + threadIdx.y,
                                 gridDim.y * blockDim.y, count);
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeY, CudaGridRangeY);

// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : GpuGridRangeZ(count)) { visit(i); }
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeZ(T count) {
  return detail::GpuGridRange<T>(blockIdx.z * blockDim.z + threadIdx.z,
                                 gridDim.z * blockDim.z, count);
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeZ, CudaGridRangeZ);
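
//
// Example (illustrative sketch, not part of this header's API): a simple
// element-wise kernel written with the grid-stride helpers above. The kernel
// name and parameters are hypothetical.
//
//   template <typename T>
//   __global__ void ScaleKernel(int count, T alpha, T* __restrict__ data) {
//     for (int i : GpuGridRangeX(count)) {
//       data[i] *= alpha;  // Each grid-stride iteration handles one element.
//     }
//   }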

// Mask for all 32 threads in a warp.
__device__ const unsigned kCudaWarpAll = 0xffffffff;
// ROCM TODO add ROCM implementation
// Mask for all 64 threads in a wavefront.
__device__ const unsigned kGpuWarpAll = 0xffffffff;

// Returns the warp lane ID of the calling thread.
__device__ inline unsigned GpuLaneId() {
  unsigned int lane_id;
#if GOOGLE_CUDA
#if __clang__
  return __nvvm_read_ptx_sreg_laneid();
#else   // __clang__
  asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
#endif  // __clang__
#elif TENSORFLOW_USE_ROCM
  lane_id = __lane_id();
#endif
  return lane_id;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuLaneId, CudaLaneId);

namespace detail {
// Returns true if mask is a valid parameter for __shfl*sync to return a well
// defined value, assuming the calling lane will read from src_lane as part of
// the shuffle operation.
//
// Specifically, returns true iff mask has the calling lane bit and the src_lane
// bit set, and the src_lane calls this function with the same mask value
// (required for the two threads to wait for each other).
//
// On Volta, for some invalid masks, this function hangs or returns false
// positives, because the implementation shuffles with the same mask that
// we are validating. Run on Pascal if you suspect that the mask is incorrect.
__device__ inline bool GpuValidateShuffleSyncMask(unsigned mask,
                                                  unsigned src_lane) {
  unsigned src_dst_mask = 1u << GpuLaneId() | 1u << src_lane;
#if CUDA_VERSION >= 9000
  unsigned src_lane_mask = __shfl_sync(mask, mask, src_lane);
#else
#if GOOGLE_CUDA
  unsigned src_lane_mask = __shfl(mask, src_lane);
#elif TENSORFLOW_USE_ROCM
  unsigned src_lane_mask =
      __shfl(static_cast<int>(mask), static_cast<int>(src_lane));
#endif
#endif
  return (src_dst_mask & ~mask) == 0 && src_lane_mask == mask;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuValidateShuffleSyncMask,
                                  CudaValidateShuffleSyncMask);

// Returns the actual source lane for shuffle.
__device__ inline unsigned GpuShuffleGetSrcLane(int src_lane, int width) {
  int lane_id = GpuLaneId();
  int lane_base = lane_id & ~width + 1;  // lane_id rounded down to a multiple
                                         // of width (width is a power of 2).
  int lane_offset = src_lane & width - 1;  // src_lane modulo width.
  return lane_base + lane_offset;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleGetSrcLane, CudaShuffleGetSrcLane);

// Returns the source lane for shuffle up.
__device__ inline unsigned GpuShuffleUpGetSrcLane(unsigned delta, int width) {
  unsigned lane_id = GpuLaneId();
  if ((lane_id & width - 1) < delta) {
    return lane_id;
  }
  return lane_id - delta;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpGetSrcLane,
                                  CudaShuffleUpGetSrcLane);

// Returns the source lane for shuffle down.
__device__ inline unsigned GpuShuffleDownGetSrcLane(unsigned delta, int width) {
  unsigned lane_id = GpuLaneId();
  if ((lane_id & width - 1) + delta >= width) {
    return lane_id;
  }
  return lane_id + delta;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleDownGetSrcLane,
                                  CudaShuffleDownGetSrcLane);

// Returns the source lane for shuffle xor.
__device__ inline unsigned GpuShuffleXorGetSrcLane(int lane_mask, int width) {
  int lane_id = GpuLaneId();
  int src_lane = lane_id ^ lane_mask;
  if (src_lane > (lane_id | width - 1)) {
    return lane_id;
  }
  return src_lane;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleXorGetSrcLane,
                                  CudaShuffleXorGetSrcLane);
}  // namespace detail

// For all *_sync wrappers below, it is illegal to synchronize threads from
// different program locations, because that is not supported before sm_70.
// In other words, all threads in 'mask' must call the functions in convergence.
// Code that requires sm_70 (and CUDA 9) may use the intrinsic directly.
//
// It is also illegal to shuffle with a mask that produces an undefined result
// for any of the threads. Specifically, all source threads of the shuffle
// must have their corresponding bit in 'mask' set.

// Wrapper for __syncwarp. No-op for CUDA 8 and earlier.
__device__ inline void GpuSyncWarp(unsigned mask = kCudaWarpAll) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  __syncwarp(mask);
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuSyncWarp, CudaSyncWarp);

// Wrapper for __ballot_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline unsigned GpuBallotSync(unsigned mask, int pred) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  return __ballot_sync(mask, pred);
#else
  return __ballot(pred) & mask;  // Apply mask to match __ballot_sync's spec.
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuBallotSync, CudaBallotSync);
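
//
// Example (illustrative sketch): counting how many lanes of a fully active
// warp satisfy a predicate with GpuBallotSync. 'value' and 'threshold' are
// hypothetical per-lane variables.
//
//   unsigned ballot = GpuBallotSync(kCudaWarpAll, value > threshold);
//   int num_active = __popc(ballot);  // Population count of the lane mask.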

// Wrapper for __any_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int GpuAnySync(unsigned mask, int pred) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  return __any_sync(mask, pred);
#else
  return __any(pred);
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAnySync, CudaAnySync);

// Wrapper for __all_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int GpuAllSync(unsigned mask, int pred) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  return __all_sync(mask, pred);
#else
  return __all(pred);
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAllSync, CudaAllSync);

// Wrapper for __shfl_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T GpuShuffleSync(unsigned mask, T value, int src_lane,
                            int width = warpSize) {
  assert(!(width & width - 1));  // width must be a power of two.
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleGetSrcLane(src_lane, width)));
#if CUDA_VERSION >= 9000
  return __shfl_sync(mask, value, src_lane, width);
#else
  return __shfl(value, src_lane, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleSync(unsigned mask, double value,
                                        int src_lane, int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleSync(mask, hi, src_lane, width);
  lo = GpuShuffleSync(mask, lo, src_lane, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  // Reinterpret the double's bits; a static_cast would convert the numeric
  // value and lose information.
  auto tmp = *(reinterpret_cast<uint64_t*>(&value));
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl(static_cast<int>(hi), src_lane, width);
  lo = __shfl(static_cast<int>(lo), src_lane, width);
  uint64_t result =
      static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
  return *(reinterpret_cast<double*>(&result));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleSync, CudaShuffleSync);
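
//
// Example (illustrative sketch): broadcasting a value held by lane 0 to every
// lane of a fully active warp. 'partial' is a hypothetical per-lane variable.
//
//   float broadcast = GpuShuffleSync(kCudaWarpAll, partial, /*src_lane=*/0);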

// Wrapper for __shfl_up_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ inline T GpuShuffleUpSync(unsigned mask, T value, unsigned delta,
                                     int width = warpSize) {
  assert(!(width & width - 1));  // width must be a power of two.
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleUpGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_up_sync(mask, value, delta, width);
#else
  return __shfl_up(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleUpSync(unsigned mask, double value,
                                          unsigned delta,
                                          int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleUpSync(mask, hi, delta, width);
  lo = GpuShuffleUpSync(mask, lo, delta, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  // Reinterpret the double's bits; a static_cast would convert the numeric
  // value and lose information.
  auto tmp = *(reinterpret_cast<uint64_t*>(&value));
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl_up(static_cast<int>(hi), delta, width);
  lo = __shfl_up(static_cast<int>(lo), delta, width);
  uint64_t result =
      static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
  return *(reinterpret_cast<double*>(&result));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpSync, CudaShuffleUpSync);
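
//
// Example (illustrative sketch): an inclusive prefix sum across a fully active
// warp using GpuShuffleUpSync. 'lane' and 'val' are hypothetical variables.
//
//   unsigned lane = GpuLaneId();
//   for (int delta = 1; delta < warpSize; delta *= 2) {
//     float up = GpuShuffleUpSync(kCudaWarpAll, val, delta);
//     if (lane >= delta) val += up;
//   }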

// Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
// in convergence, see comment above for details.
template <typename T>
__device__ inline T GpuShuffleDownSync(unsigned mask, T value, unsigned delta,
                                       int width = warpSize) {
  assert(!(width & width - 1));  // width must be a power of two.
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleDownGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(mask, value, delta, width);
#else
  return __shfl_down(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleDownSync(unsigned mask, double value,
                                            unsigned delta,
                                            int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleDownSync(mask, hi, delta, width);
  lo = GpuShuffleDownSync(mask, lo, delta, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  // Reinterpret the double's bits; a static_cast would convert the numeric
  // value and lose information.
  auto tmp = *(reinterpret_cast<uint64_t*>(&value));
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl_down(static_cast<int>(hi), delta, width);
  lo = __shfl_down(static_cast<int>(lo), delta, width);
  uint64_t result =
      static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
  return *(reinterpret_cast<double*>(&result));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleDownSync, CudaShuffleDownSync);
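
//
// Example (illustrative sketch): a warp-wide sum reduction with
// GpuShuffleDownSync. After the loop, lane 0 holds the sum of 'val' across the
// warp. 'val' is a hypothetical per-lane variable.
//
//   for (int delta = warpSize / 2; delta > 0; delta /= 2) {
//     val += GpuShuffleDownSync(kCudaWarpAll, val, delta);
//   }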

// Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T GpuShuffleXorSync(unsigned mask, T value, int lane_mask,
                               int width = warpSize) {
  assert(!(width & width - 1));  // width must be a power of two.
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleXorGetSrcLane(lane_mask, width)));
#if GOOGLE_CUDA
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, lane_mask, width);
#else
  return __shfl_xor(value, lane_mask, width);
#endif
#elif TENSORFLOW_USE_ROCM
  // ROCM TODO: check if HIP should be changed to cope with more types
  return __shfl_xor(static_cast<int>(value), lane_mask, width);
#endif
}

#if TENSORFLOW_USE_ROCM
__device__ inline Eigen::half GpuShuffleXorSync(unsigned mask,
                                                Eigen::half value,
                                                int lane_mask,
                                                int width = warpSize) {
  assert(!(width & width - 1));  // width must be a power of two.
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleXorGetSrcLane(lane_mask, width)));
  // TODO(rocm): This doesn't preserve NaN payload and flushes denorms to zero,
  // maybe this should be implemented differently?
  return static_cast<Eigen::half>(
      __shfl_xor(static_cast<float>(value), lane_mask, width));
}
#endif

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleXorSync(unsigned mask, double value,
                                           int lane_mask,
                                           int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleXorSync(mask, hi, lane_mask, width);
  lo = GpuShuffleXorSync(mask, lo, lane_mask, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  // Reinterpret the double's bits; a static_cast would convert the numeric
  // value and lose information.
  auto tmp = *(reinterpret_cast<uint64_t*>(&value));
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl_xor(static_cast<int>(hi), lane_mask, width);
  lo = __shfl_xor(static_cast<int>(lo), lane_mask, width);
  uint64_t result =
      static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
  return *(reinterpret_cast<double*>(&result));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleXorSync, CudaShuffleXorSync);
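
//
// Example (illustrative sketch): a butterfly all-reduce with
// GpuShuffleXorSync; after the loop every lane of a fully active warp holds
// the warp-wide sum. 'val' is a hypothetical per-lane variable.
//
//   for (int lane_mask = warpSize / 2; lane_mask > 0; lane_mask /= 2) {
//     val += GpuShuffleXorSync(kCudaWarpAll, val, lane_mask);
//   }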

// Wrapper for __ldg.
template <typename T>
__host__ __device__ T GpuLdg(const T* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}

__host__ __device__ inline bool GpuLdg(const bool* address) {
  return GpuLdg(reinterpret_cast<const char*>(address)) != 0;
}

__host__ __device__ inline std::complex<float> GpuLdg(
    const std::complex<float>* address) {
#if __CUDA_ARCH__ >= 350
  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
  return std::complex<float>(mem.x, mem.y);
#else
  return *address;
#endif
}

__host__ __device__ inline std::complex<double> GpuLdg(
    const std::complex<double>* address) {
#if __CUDA_ARCH__ >= 350
  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
  return std::complex<double>(mem.x, mem.y);
#else
  return *address;
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuLdg, CudaLdg);
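
//
// Example (illustrative sketch): reading kernel inputs through the read-only
// data cache with GpuLdg. The kernel name and parameters are hypothetical.
//
//   __global__ void AddKernel(int count, const float* __restrict__ in,
//                             float* __restrict__ out) {
//     for (int i : GpuGridRangeX(count)) {
//       out[i] += GpuLdg(in + i);
//     }
//   }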

// Zeroes count elements starting at ptr using all threads of a 1-D grid.
// Note: this function does not synchronize, and therefore the memory range is
// not guaranteed to be zero until the next kernel launch.
template <typename T>
__global__ void SetZero(const int count, T* __restrict__ ptr) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1);
  assert(blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : GpuGridRangeX(count)) {
    ptr[i] = T(0);
  }
}

// Helper to set all tensor entries to a specific value.
template <typename T>
__global__ void SetToValue(const int count, T* __restrict__ ptr, T value) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1);
  assert(blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : GpuGridRangeX(count)) {
    ptr[i] = value;
  }
}
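
//
// Example (illustrative sketch): launching SetToValue on a plain CUDA stream
// with a hand-picked 1-D launch configuration. 'count', 'device_ptr', and
// 'stream' are hypothetical; real kernels in this code base typically derive
// the configuration from a launch-config helper instead.
//
//   constexpr int kBlockSize = 256;
//   int num_blocks = (count + kBlockSize - 1) / kBlockSize;
//   SetToValue<float><<<num_blocks, kBlockSize, 0, stream>>>(count,
//                                                            device_ptr, 1.0f);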

namespace detail {
// Helper function for atomic accumulation implemented as CAS.
template <typename T, typename F>
__device__ T GpuAtomicCasHelper(T* ptr, F accumulate) {
  T old = *ptr;
  T assumed;
  do {
    assumed = old;
    old = atomicCAS(ptr, assumed, accumulate(assumed));
  } while (assumed != old);
  return old;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicCasHelper, CudaAtomicCasHelper);

// Overload for floating point (using integer comparison to handle NaN
// correctly).
template <typename F>
__device__ float GpuAtomicCasHelper(float* ptr, F accumulate) {
  return __int_as_float(
      GpuAtomicCasHelper(reinterpret_cast<int32*>(ptr), [accumulate](int32 a) {
        return __float_as_int(accumulate(__int_as_float(a)));
      }));
}
template <typename F>
__device__ double GpuAtomicCasHelper(double* ptr, F accumulate) {
#if TENSORFLOW_USE_ROCM
  // FIXME: remove the workaround below once the bug is fixed.
  // HIP has a bug in the implementation of __longlong_as_double,
  // so we work around it by using reinterpret_cast<double*>.
  uint64_t result =
      GpuAtomicCasHelper(reinterpret_cast<tensorflow::uint64*>(ptr),
                         [accumulate](tensorflow::uint64 a) {
                           return __double_as_longlong(
                               accumulate(*(reinterpret_cast<double*>(&a))));
                         });
  return *(reinterpret_cast<double*>(&result));
#else
  return __longlong_as_double(GpuAtomicCasHelper(
      reinterpret_cast<tensorflow::uint64*>(ptr),
      [accumulate](tensorflow::uint64 a) {
        return __double_as_longlong(accumulate(__longlong_as_double(a)));
      }));
#endif
}

// Overload of above function for half. Note that we don't have
// atomicCAS() for anything less than 32 bits, so we need to include the
// other 16 bits in the operation.
//
// This version is going to be very slow under high concurrency, since most
// threads will be spinning on failing their compare-and-swap tests. (The fact
// that we get false sharing on the neighboring fp16 makes this even worse.)
// If you are doing a large reduction, you are much better off doing the
// intermediate steps in fp32 and then switching to fp16 as late as you can in
// the calculations.
//
// Note: Assumes little endian.
template <typename F>
__device__ Eigen::half GpuAtomicCasHelper(Eigen::half* ptr, F accumulate) {
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
#endif
  namespace half_impl = Eigen::half_impl;
  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
  assert(!(intptr & 0x1));  // should be 2-aligned.
  if (intptr & 0x2) {
    // The half is in the second part of the uint32 (upper 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
    uint32 result = GpuAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short high = static_cast<unsigned short>(arg >> 16);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
      return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
  } else {
    // The half is in the first part of the uint32 (lower 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr);
    uint32 result = GpuAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
      return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
  }
}

template <typename From, typename To>
using ToTypeIfConvertible =
    typename std::enable_if<std::is_convertible<From, To>::value, To>::type;

}  // namespace detail

// CUDA provides atomic ops, but not for all types. We provide wrappers
// for some ops and provide implementations for all reasonable types.

template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicAdd(T* ptr, U value) {
  return atomicAdd(ptr, value);
}

__device__ inline Eigen::half GpuAtomicAdd(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a + value; });
}

#if (__CUDA_ARCH__ < 600) || TENSORFLOW_USE_ROCM
__device__ inline double GpuAtomicAdd(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(ptr,
                                    [value](double a) { return a + value; });
}
#endif

// GpuAtomicAdd
// Specializations of GpuAtomicAdd for complex types, which the underlying
// atomicAdd does not support. We treat a std::complex<T>* as a T* (the C++
// standard section 26.4.4 allows this explicitly) and atomically add the real
// and imaginary components individually. The operation as a whole is not
// atomic, but we can safely treat the components independently for the
// purpose of accumulating.

// ROCM TODO support GpuAtomicAdd for std::complex<>
#if GOOGLE_CUDA
__device__ inline std::complex<float> GpuAtomicAdd(std::complex<float>* ptr,
                                                   std::complex<float> value) {
  auto ptr_scalar = reinterpret_cast<float*>(ptr);
  return std::complex<float>(GpuAtomicAdd(ptr_scalar, value.real()),
                             GpuAtomicAdd(ptr_scalar + 1, value.imag()));
}

__device__ inline std::complex<double> GpuAtomicAdd(
    std::complex<double>* ptr, std::complex<double> value) {
  auto ptr_scalar = reinterpret_cast<double*>(ptr);
  return std::complex<double>(GpuAtomicAdd(ptr_scalar, value.real()),
                              GpuAtomicAdd(ptr_scalar + 1, value.imag()));
}
#endif
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicAdd, CudaAtomicAdd);
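
//
// Example (illustrative sketch): accumulating gradients into a shared output
// buffer, where several threads may hit the same output index. The kernel name
// and parameters are hypothetical.
//
//   __global__ void ScatterAddKernel(int count, const int* __restrict__ index,
//                                    const float* __restrict__ grad,
//                                    float* __restrict__ out) {
//     for (int i : GpuGridRangeX(count)) {
//       GpuAtomicAdd(out + index[i], grad[i]);
//     }
//   }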

// GpuAtomicSub
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicSub(T* ptr, U value) {
  return atomicSub(ptr, value);
}

// Specializations of subtraction which add the negative value.
__device__ inline float GpuAtomicSub(float* ptr, float value) {
  return GpuAtomicAdd(ptr, -value);
}

__device__ inline double GpuAtomicSub(double* ptr, double value) {
  return GpuAtomicAdd(ptr, -value);
}

__device__ inline tensorflow::uint64 GpuAtomicSub(tensorflow::uint64* ptr,
                                                  tensorflow::uint64 value) {
  // Negating an unsigned value wraps modulo 2^64, which makes the addition
  // below equivalent to subtraction.
  return GpuAtomicAdd(ptr, -value);
}

__device__ inline Eigen::half GpuAtomicSub(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a - value; });
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicSub, CudaAtomicSub);

// GpuAtomicMax
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMax(T* ptr, U value) {
  return atomicMax(ptr, value);
}

#if TENSORFLOW_USE_ROCM

/*
 * CUDA runtime headers have the following defined
 * __device__ int max(int, int)
 * __device__ float max(float, float)
 * __device__ double max(double, double)
 *
 * and many others, whereas HIP runtime headers only have the "int" version.
 *
 * Therefore we need to special-case the ROCm version to call the correct
 * underlying routines for float and double types.
 *
 */

__device__ inline float GpuAtomicMax(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](float a) { return fmaxf(a, value); });
}

__device__ inline double GpuAtomicMax(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return fmax(a, value); });
}

#else

__device__ inline float GpuAtomicMax(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(ptr,
                                    [value](float a) { return max(a, value); });
}

__device__ inline double GpuAtomicMax(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return max(a, value); });
}

#endif

__device__ inline Eigen::half GpuAtomicMax(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return max(a, value); });
}

#if __CUDA_ARCH__ < 320
__device__ inline tensorflow::uint64 GpuAtomicMax(tensorflow::uint64* ptr,
                                                  tensorflow::uint64 value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](tensorflow::uint64 a) { return max(a, value); });
}
#endif
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMax, CudaAtomicMax);
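
//
// Example (illustrative sketch): computing the maximum of an array into a
// single scalar that was previously initialized to a very small value
// (e.g. -FLT_MAX). The kernel name and parameters are hypothetical.
//
//   __global__ void MaxKernel(int count, const float* __restrict__ in,
//                             float* __restrict__ result) {
//     for (int i : GpuGridRangeX(count)) {
//       GpuAtomicMax(result, in[i]);
//     }
//   }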

// GpuAtomicMin
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMin(T* ptr, U value) {
  return atomicMin(ptr, value);
}

#if TENSORFLOW_USE_ROCM

/*
 * CUDA runtime headers have the following defined
 * __device__ int min(int, int)
 * __device__ float min(float, float)
 * __device__ double min(double, double)
 *
 * and many others, whereas HIP runtime headers only have the "int" version.
 *
 * Therefore we need to special-case the ROCm version to call the correct
 * underlying routines for float and double types.
 *
 */

__device__ inline float GpuAtomicMin(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](float a) { return fminf(a, value); });
}

__device__ inline double GpuAtomicMin(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return fmin(a, value); });
}

#else

__device__ inline float GpuAtomicMin(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(ptr,
                                    [value](float a) { return min(a, value); });
}

__device__ inline double GpuAtomicMin(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return min(a, value); });
}

#endif

__device__ inline Eigen::half GpuAtomicMin(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return min(a, value); });
}

#if __CUDA_ARCH__ < 320
__device__ inline tensorflow::uint64 GpuAtomicMin(tensorflow::uint64* ptr,
                                                  tensorflow::uint64 value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](tensorflow::uint64 a) { return min(a, value); });
}
#endif
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMin, CudaAtomicMin);

// GpuAtomicMul
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMul(T* ptr, U value) {
  return detail::GpuAtomicCasHelper(ptr, [value](T a) { return a * value; });
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMul, CudaAtomicMul);

// GpuAtomicDiv
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicDiv(T* ptr, U value) {
  return detail::GpuAtomicCasHelper(ptr, [value](T a) { return a / value; });
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicDiv, CudaAtomicDiv);

// Operator overloads for complex numbers.
#if GOOGLE_CUDA
__device__ inline std::complex<float> operator+(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCaddf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<float> operator-(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCsubf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<float> operator*(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCmulf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<float> operator/(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCdivf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<double> operator+(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCadd(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}

__device__ inline std::complex<double> operator-(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCsub(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}

__device__ inline std::complex<double> operator*(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCmul(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}

__device__ inline std::complex<double> operator/(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCdiv(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}
#endif  // GOOGLE_CUDA

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#endif  // TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_