/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_
#define TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_

/**
 * Wrappers and helpers for CUDA device code.
 *
 * Wraps the warp-cooperative intrinsics introduced in CUDA 9 to provide
 * backwards compatibility, see go/volta-porting for details.
 * Provides atomic operations on types that aren't natively supported.
 * Defines a number of macros and types providing a shared interface
 * to either CUDA or ROCm APIs, depending on the build.
 */

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include <algorithm>
#include <complex>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#if GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cuComplex.h"
#include "third_party/gpus/cuda/include/cuda.h"
#else
#include "rocm/include/hip/hip_complex.h"
#endif

#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/gpu_cuda_alias.h"

#if GOOGLE_CUDA
using gpuFloatComplex = cuFloatComplex;
using gpuDoubleComplex = cuDoubleComplex;
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#define gpuEventRecord cudaEventRecord
#define gpuEventSynchronize cudaEventSynchronize
#define gpuEventDestroy cudaEventDestroy
#define gpuEventCreate cudaEventCreate
#define gpuEventCreateWithFlags cudaEventCreateWithFlags
#define gpuEventDisableTiming cudaEventDisableTiming
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuFree cudaFree
#elif TENSORFLOW_USE_ROCM
using gpuFloatComplex = hipFloatComplex;
using gpuDoubleComplex = hipDoubleComplex;
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
using cudaError = int;
using cudaError_t = int;
#define cudaSuccess 0
#define cudaGetLastError hipGetLastError
#define gpuEventRecord hipEventRecord
#define gpuEventDestroy hipEventDestroy
#define gpuEventSynchronize hipEventSynchronize
#define gpuEventCreate hipEventCreate
#define gpuEventCreateWithFlags hipEventCreateWithFlags
#define gpuEventDisableTiming hipEventDisableTiming
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuFree hipFree
static std::string cudaGetErrorString(int err) { return std::to_string(err); }
#endif

#define TF_RETURN_IF_CUDA_ERROR(result)                   \
  do {                                                    \
    cudaError_t error(result);                            \
    if (!SE_PREDICT_TRUE(error == cudaSuccess)) {         \
      return errors::Internal("Cuda call failed with ",   \
                              cudaGetErrorString(error)); \
    }                                                     \
  } while (0)

#define TF_OP_REQUIRES_CUDA_SUCCESS(context, result)                   \
  do {                                                                 \
    cudaError_t error(result);                                         \
    if (!SE_PREDICT_TRUE(error == cudaSuccess)) {                      \
      context->SetStatus(errors::Internal("Cuda call failed with ",    \
                                          cudaGetErrorString(error))); \
      return;                                                          \
    }                                                                  \
  } while (0)
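
// Example (illustrative sketch, not part of this header): typical use of the
// two macros above when checking the result of enqueued GPU work.
// 'LaunchMyKernel' is hypothetical; TF_RETURN_IF_CUDA_ERROR returns a Status
// from the enclosing function, while TF_OP_REQUIRES_CUDA_SUCCESS sets the
// status on an OpKernelContext-like object and returns.
//
//   void Compute(OpKernelContext* ctx) {
//     LaunchMyKernel(ctx);  // enqueues work on the GPU stream
//     TF_OP_REQUIRES_CUDA_SUCCESS(ctx, cudaGetLastError());
//   }
//
//   Status LaunchAndCheck() {
//     LaunchMyKernel();
//     TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
//     return Status::OK();
//   }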

namespace tensorflow {
// According to the HIP developer guide at
// https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md#assert
// assert is not supported by HIP. While we are waiting for assert support in
// HIP kernels, the assert call is macroed to a no-op so that it does not
// block us from creating a debug build.
#if TENSORFLOW_USE_ROCM
#undef assert
#define assert(x) \
  {}
#endif

namespace detail {

// Helper for range-based for loops using 'delta' increments.
// Usage: see the GpuGridRange{X,Y,Z}() functions below.
template <typename T>
class GpuGridRange {
  struct Iterator {
    __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
    __device__ T operator*() const { return index_; }
    __device__ Iterator& operator++() {
      index_ += delta_;
      return *this;
    }
    __device__ bool operator!=(const Iterator& other) const {
      bool greater = index_ > other.index_;
      bool less = index_ < other.index_;
      // Anything past an end iterator (delta_ == 0) is equal.
      // In range-based for loops, this optimizes to 'return less'.
      if (!other.delta_) {
        return less;
      }
      if (!delta_) {
        return greater;
      }
      return less || greater;
    }

   private:
    T index_;
    const T delta_;
  };

 public:
  __device__ GpuGridRange(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

  __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
  __device__ Iterator end() const { return Iterator{end_, 0}; }

 private:
  T begin_;
  T delta_;
  T end_;
};

#ifndef TENSORFLOW_USE_ROCM
template <typename... T>
using CudaGridRange = GpuGridRange<T...>;
#endif
}  // namespace detail

// Helper to visit indices in the range 0 <= i < count, using the x-coordinate
// of the global thread index. That is, each index i is visited by all threads
// with the same x-coordinate.
// Usage: for(int i : GpuGridRangeX(count)) { visit(i); }
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeX(T count) {
  return detail::GpuGridRange<T>(
      /*begin=*/blockIdx.x * blockDim.x + threadIdx.x,
      /*delta=*/gridDim.x * blockDim.x, /*end=*/count);
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeX, CudaGridRangeX);

// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : GpuGridRangeY(count)) { visit(i); }
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeY(T count) {
  return detail::GpuGridRange<T>(
      /*begin=*/blockIdx.y * blockDim.y + threadIdx.y,
      /*delta=*/gridDim.y * blockDim.y, /*end=*/count);
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeY, CudaGridRangeY);

// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : GpuGridRangeZ(count)) { visit(i); }
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeZ(T count) {
  return detail::GpuGridRange<T>(
      /*begin=*/blockIdx.z * blockDim.z + threadIdx.z,
      /*delta=*/gridDim.z * blockDim.z, /*end=*/count);
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeZ, CudaGridRangeZ);
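
// Example (illustrative sketch, not part of this header): a grid-stride
// elementwise kernel built on GpuGridRangeX. The kernel name and its
// arguments are hypothetical.
//
//   template <typename T>
//   __global__ void ScaleKernel(int count, T alpha, T* __restrict__ data) {
//     for (int i : GpuGridRangeX(count)) {
//       data[i] *= alpha;
//     }
//   }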

// Mask for all 32 threads in a warp.
__device__ const unsigned kCudaWarpAll = 0xffffffff;
// ROCM TODO add ROCM implementation
// Mask for all 64 threads in a wavefront.
__device__ const unsigned kGpuWarpAll = 0xffffffff;

// Returns the warp lane ID of the calling thread.
__device__ inline unsigned GpuLaneId() {
  unsigned int lane_id;
#if GOOGLE_CUDA
#if __clang__
  return __nvvm_read_ptx_sreg_laneid();
#else   // __clang__
  asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
#endif  // __clang__
#elif TENSORFLOW_USE_ROCM
  lane_id = __lane_id();
#endif
  return lane_id;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuLaneId, CudaLaneId);

namespace detail {
// Returns true if mask is a valid parameter for __shfl*sync to return a well
// defined value, assuming the calling lane will read from src_lane as part of
// the shuffle operation.
//
// Specifically, returns true iff mask has the calling lane bit and the src_lane
// bit set, and the src_lane calls this function with the same mask value
// (required for the two threads to wait for each other).
//
// On Volta, for some invalid masks, this function hangs or returns false
// positives, because the implementation shuffles with the same mask that
// we are validating. Run on Pascal if you suspect that the mask is incorrect.
__device__ inline bool GpuValidateShuffleSyncMask(unsigned mask,
                                                  unsigned src_lane) {
  unsigned src_dst_mask = 1u << GpuLaneId() | 1u << src_lane;
#if CUDA_VERSION >= 9000
  unsigned src_lane_mask = __shfl_sync(mask, mask, src_lane);
#else
#if GOOGLE_CUDA
  unsigned src_lane_mask = __shfl(mask, src_lane);
#elif TENSORFLOW_USE_ROCM
  unsigned src_lane_mask =
      __shfl(static_cast<int>(mask), static_cast<int>(src_lane));
#endif
#endif
  return (src_dst_mask & ~mask) == 0 && src_lane_mask == mask;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuValidateShuffleSyncMask,
                                  CudaValidateShuffleSyncMask);

// Returns the actual source lane for shuffle.
__device__ inline unsigned GpuShuffleGetSrcLane(int src_lane, int width) {
  int lane_id = GpuLaneId();
  int lane_base = lane_id & ~width + 1;    // lane_id & -width
  int lane_offset = src_lane & width - 1;  // src_lane % width
  return lane_base + lane_offset;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleGetSrcLane, CudaShuffleGetSrcLane);

// Returns the source lane for shuffle up.
__device__ inline unsigned GpuShuffleUpGetSrcLane(unsigned delta, int width) {
  unsigned lane_id = GpuLaneId();
  if ((lane_id & width - 1) < delta) {
    return lane_id;
  }
  return lane_id - delta;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpGetSrcLane,
                                  CudaShuffleUpGetSrcLane);

// Returns the source lane for shuffle down.
__device__ inline unsigned GpuShuffleDownGetSrcLane(unsigned delta, int width) {
  unsigned lane_id = GpuLaneId();
  if ((lane_id & width - 1) + delta >= width) {
    return lane_id;
  }
  return lane_id + delta;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleDownGetSrcLane,
                                  CudaShuffleDownGetSrcLane);

// Returns the source lane for shuffle xor.
__device__ inline unsigned GpuShuffleXorGetSrcLane(int lane_mask, int width) {
  int lane_id = GpuLaneId();
  int src_lane = lane_id ^ lane_mask;
  if (src_lane > (lane_id | width - 1)) {
    return lane_id;
  }
  return src_lane;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleXorGetSrcLane,
                                  CudaShuffleXorGetSrcLane);
}  // namespace detail

// For all *_sync wrappers below, it is illegal to synchronize threads from
// different program locations, because that is not supported before sm_70.
// In other words, all threads in 'mask' must call the functions in convergence.
// Code that requires sm_70 (and CUDA 9) may use the intrinsic directly.
//
// It is also illegal to shuffle with a mask that produces an undefined result
// for any of the threads. Specifically, all source threads of the shuffle
// must have their corresponding bit in 'mask' set.

// Wrapper for __syncwarp. No-op for CUDA 8 and earlier.
__device__ inline void GpuSyncWarp(unsigned mask = kCudaWarpAll) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  __syncwarp(mask);
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuSyncWarp, CudaSyncWarp);

// Wrapper for __ballot_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline unsigned GpuBallotSync(unsigned mask, int pred) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  return __ballot_sync(mask, pred);
#else
  return __ballot(pred) & mask;  // Apply mask to match __ballot_sync's spec.
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuBallotSync, CudaBallotSync);
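
// Example (illustrative sketch, not part of this header): counting how many
// lanes of a fully converged warp satisfy a predicate, using GpuBallotSync
// and __popc. 'pred' is a hypothetical per-lane condition.
//
//   __device__ int CountWarpVotes(bool pred) {
//     unsigned ballot = GpuBallotSync(kCudaWarpAll, pred ? 1 : 0);
//     return __popc(ballot);
//   }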

// Wrapper for __any_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int GpuAnySync(unsigned mask, int pred) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  return __any_sync(mask, pred);
#else
  return __any(pred);
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAnySync, CudaAnySync);

// Wrapper for __all_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int GpuAllSync(unsigned mask, int pred) {
  assert(mask & 1u << GpuLaneId());
#if CUDA_VERSION >= 9000
  return __all_sync(mask, pred);
#else
  return __all(pred);
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAllSync, CudaAllSync);

// Wrapper for __shfl_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T GpuShuffleSync(unsigned mask, T value, int src_lane,
                            int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleGetSrcLane(src_lane, width)));
#if CUDA_VERSION >= 9000
  return __shfl_sync(mask, value, src_lane, width);
#else
  return __shfl(value, src_lane, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleSync(unsigned mask, double value,
                                        int src_lane, int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleSync(mask, hi, src_lane, width);
  lo = GpuShuffleSync(mask, lo, src_lane, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  // Reinterpret the double's bits (rather than value-converting it), using the
  // same reinterpret_cast workaround as the HIP atomic CAS helper below.
  auto tmp = *(reinterpret_cast<uint64_t*>(&value));
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl(static_cast<int>(hi), src_lane, width);
  lo = __shfl(static_cast<int>(lo), src_lane, width);
  uint64_t result =
      static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
  return *(reinterpret_cast<double*>(&result));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleSync, CudaShuffleSync);
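
// Example (illustrative sketch, not part of this header): broadcasting a
// value from lane 0 to every lane of a fully converged warp.
//
//   __device__ float WarpBroadcastFromLane0(float value) {
//     return GpuShuffleSync(kCudaWarpAll, value, /*src_lane=*/0);
//   }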

// Wrapper for __shfl_up_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ inline T GpuShuffleUpSync(unsigned mask, T value, unsigned delta,
                                     int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleUpGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_up_sync(mask, value, delta, width);
#else
  return __shfl_up(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleUpSync(unsigned mask, double value,
                                          unsigned delta,
                                          int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleUpSync(mask, hi, delta, width);
  lo = GpuShuffleUpSync(mask, lo, delta, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  // Reinterpret the double's bits (rather than value-converting it), using the
  // same reinterpret_cast workaround as the HIP atomic CAS helper below.
  auto tmp = *(reinterpret_cast<uint64_t*>(&value));
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl_up(static_cast<int>(hi), delta, width);
  lo = __shfl_up(static_cast<int>(lo), delta, width);
  uint64_t result =
      static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
  return *(reinterpret_cast<double*>(&result));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpSync, CudaShuffleUpSync);

// Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
// in convergence, see comment above for details.
template <typename T>
__device__ inline T GpuShuffleDownSync(unsigned mask, T value, unsigned delta,
                                       int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleDownGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(mask, value, delta, width);
#else
  return __shfl_down(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleDownSync(unsigned mask, double value,
                                            unsigned delta,
                                            int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleDownSync(mask, hi, delta, width);
  lo = GpuShuffleDownSync(mask, lo, delta, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  // Reinterpret the double's bits (rather than value-converting it), using the
  // same reinterpret_cast workaround as the HIP atomic CAS helper below.
  auto tmp = *(reinterpret_cast<uint64_t*>(&value));
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl_down(static_cast<int>(hi), delta, width);
  lo = __shfl_down(static_cast<int>(lo), delta, width);
  uint64_t result =
      static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
  return *(reinterpret_cast<double*>(&result));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleDownSync, CudaShuffleDownSync);
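
// Example (illustrative sketch, not part of this header): a full-warp sum
// reduction built on GpuShuffleDownSync. After the loop, lane 0 holds the
// warp-wide sum. Assumes a fully converged 32-lane warp.
//
//   __device__ float WarpReduceSum(float value) {
//     for (int delta = 16; delta > 0; delta /= 2) {
//       value += GpuShuffleDownSync(kCudaWarpAll, value, delta);
//     }
//     return value;
//   }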

// Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T GpuShuffleXorSync(unsigned mask, T value, int lane_mask,
                               int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleXorGetSrcLane(lane_mask, width)));
#if GOOGLE_CUDA
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, lane_mask, width);
#else
  return __shfl_xor(value, lane_mask, width);
#endif
#elif TENSORFLOW_USE_ROCM
  // ROCM TODO: check if HIP should be changed to cope with more types
  return __shfl_xor(static_cast<int>(value), lane_mask, width);
#endif
}

#if TENSORFLOW_USE_ROCM
__device__ inline Eigen::half GpuShuffleXorSync(unsigned mask,
                                                Eigen::half value,
                                                int lane_mask,
                                                int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::GpuValidateShuffleSyncMask(
      mask, detail::GpuShuffleXorGetSrcLane(lane_mask, width)));
  // TODO(rocm): This doesn't preserve NaN payload and flushes denorms to zero,
  // maybe this should be implemented differently?
  return static_cast<Eigen::half>(
      __shfl_xor(static_cast<float>(value), lane_mask, width));
}
#endif

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double GpuShuffleXorSync(unsigned mask, double value,
                                           int lane_mask,
                                           int width = warpSize) {
#if GOOGLE_CUDA
  auto tmp = __double_as_longlong(value);
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = GpuShuffleXorSync(mask, hi, lane_mask, width);
  lo = GpuShuffleXorSync(mask, lo, lane_mask, width);
  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
#elif TENSORFLOW_USE_ROCM
  // Reinterpret the double's bits (rather than value-converting it), using the
  // same reinterpret_cast workaround as the HIP atomic CAS helper below.
  auto tmp = *(reinterpret_cast<uint64_t*>(&value));
  auto lo = static_cast<unsigned>(tmp);
  auto hi = static_cast<unsigned>(tmp >> 32);
  hi = __shfl_xor(static_cast<int>(hi), lane_mask, width);
  lo = __shfl_xor(static_cast<int>(lo), lane_mask, width);
  uint64_t result =
      static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
  return *(reinterpret_cast<double*>(&result));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleXorSync, CudaShuffleXorSync);
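
// Example (illustrative sketch, not part of this header): a butterfly
// all-reduce built on GpuShuffleXorSync; every lane of a fully converged
// 32-lane warp ends up holding the warp-wide sum. Shown with float, which the
// CUDA path supports; see the ROCM TODO above for type coverage on HIP.
//
//   __device__ float WarpAllReduceSum(float value) {
//     for (int lane_mask = 16; lane_mask > 0; lane_mask /= 2) {
//       value += GpuShuffleXorSync(kCudaWarpAll, value, lane_mask);
//     }
//     return value;
//   }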

// Wrapper for __ldg.
template <typename T>
__host__ __device__ T GpuLdg(const T* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}

__host__ __device__ inline bool GpuLdg(const bool* address) {
  return GpuLdg(reinterpret_cast<const char*>(address)) != 0;
}

__host__ __device__ inline std::complex<float> GpuLdg(
    const std::complex<float>* address) {
#if __CUDA_ARCH__ >= 350
  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
  return std::complex<float>(mem.x, mem.y);
#else
  return *address;
#endif
}

__host__ __device__ inline std::complex<double> GpuLdg(
    const std::complex<double>* address) {
#if __CUDA_ARCH__ >= 350
  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
  return std::complex<double>(mem.x, mem.y);
#else
  return *address;
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuLdg, CudaLdg);
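
// Example (illustrative sketch, not part of this header): reading a read-only
// input through GpuLdg so that, on architectures that support it, the load
// goes through the read-only data cache. Names are hypothetical.
//
//   template <typename T>
//   __global__ void CopyKernel(int count, const T* __restrict__ in,
//                              T* __restrict__ out) {
//     for (int i : GpuGridRangeX(count)) {
//       out[i] = GpuLdg(in + i);
//     }
//   }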

// Zeroes count elements starting at ptr using all threads of a 1-D grid.
// Note: this function does not synchronize, and therefore the memory range is
// not guaranteed to be zero until the next kernel launch.
template <typename T>
__global__ void SetZero(const int count, T* __restrict__ ptr) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1);
  assert(blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : GpuGridRangeX(count)) {
    ptr[i] = T(0);
  }
}

// Helper to set all tensor entries to a specific value.
template <typename T>
__global__ void SetToValue(const int count, T* __restrict__ ptr, T value) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1);
  assert(blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : GpuGridRangeX(count)) {
    ptr[i] = value;
  }
}
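
// Example (illustrative sketch, not part of this header): launching SetToValue
// with a plain 1-D CUDA launch. The block size is hypothetical; TensorFlow
// code would normally derive the launch configuration from a launch-config
// helper and launch on the device's stream rather than the default one.
//
//   float* ptr = ...;  // device memory holding 'count' floats
//   SetToValue<<<(count + 255) / 256, 256>>>(count, ptr, 1.0f);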

namespace detail {
// Helper function for atomic accumulation implemented as CAS.
template <typename T, typename F>
__device__ T GpuAtomicCasHelper(T* ptr, F accumulate) {
  T old = *ptr;
  T assumed;
  do {
    assumed = old;
    old = atomicCAS(ptr, assumed, accumulate(assumed));
  } while (assumed != old);
  return old;
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicCasHelper, CudaAtomicCasHelper);

// Overload for floating point (using integer comparison to handle NaN
// correctly).
template <typename F>
__device__ float GpuAtomicCasHelper(float* ptr, F accumulate) {
  return __int_as_float(
      GpuAtomicCasHelper(reinterpret_cast<int32*>(ptr), [accumulate](int32 a) {
        return __float_as_int(accumulate(__int_as_float(a)));
      }));
}
template <typename F>
__device__ double GpuAtomicCasHelper(double* ptr, F accumulate) {
#if TENSORFLOW_USE_ROCM
  // FIXME: remove the workaround below once the bug is fixed.
  // HIP has a bug in its implementation of __longlong_as_double, so work
  // around it by using reinterpret_cast<double*> instead.
  uint64_t result =
      GpuAtomicCasHelper(reinterpret_cast<unsigned long long*>(ptr),
                         [accumulate](tensorflow::uint64 a) {
                           return __double_as_longlong(
                               accumulate(*(reinterpret_cast<double*>(&a))));
                         });
  return *(reinterpret_cast<double*>(&result));
#else
  return __longlong_as_double(GpuAtomicCasHelper(
      reinterpret_cast<unsigned long long*>(ptr),
      [accumulate](tensorflow::uint64 a) {
        return __double_as_longlong(accumulate(__longlong_as_double(a)));
      }));
#endif
}

// Overload of the above function for half. Note that we don't have
// atomicCAS() for anything less than 32 bits, so we need to include the
// other 16 bits in the operation.
//
// This version is going to be very slow under high concurrency, since most
// threads will be spinning on failing their compare-and-swap tests. (The fact
// that we get false sharing on the neighboring fp16 makes this even worse.)
// If you are doing a large reduction, you are much better off doing the
// intermediate steps in fp32 and then switching to fp16 as late as you can
// in the calculations.
//
// Note: Assumes little endian.
template <typename F>
__device__ Eigen::half GpuAtomicCasHelper(Eigen::half* ptr, F accumulate) {
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
#endif
  namespace half_impl = Eigen::half_impl;
  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
  assert(!(intptr & 0x1));  // should be 2-aligned.
  if (intptr & 0x2) {
    // The half is in the second part of the uint32 (upper 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
    uint32 result = GpuAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short high = static_cast<unsigned short>(arg >> 16);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
      return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
  } else {
    // The half is in the first part of the uint32 (lower 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr);
    uint32 result = GpuAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
      return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
  }
}
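
// Example (illustrative sketch, not part of this header): following the
// advice above, accumulate in fp32 and convert to fp16 as late as possible,
// so only one fp16 CAS is issued per thread instead of one per element.
// 'count', 'values', and 'out' are hypothetical.
//
//   __global__ void SumToHalf(int count, const float* values,
//                             Eigen::half* out) {
//     float partial = 0.f;
//     for (int i : GpuGridRangeX(count)) partial += values[i];
//     GpuAtomicAdd(out, static_cast<Eigen::half>(partial));
//   }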

template <typename F>
__device__ long long GpuAtomicCasHelper(long long* ptr, F accumulate) {
  return static_cast<long long>(
      GpuAtomicCasHelper(reinterpret_cast<unsigned long long*>(ptr),
                         [accumulate](unsigned long long a) {
                           return static_cast<unsigned long long>(
                               accumulate(static_cast<long long>(a)));
                         }));
}

template <typename From, typename To>
using ToTypeIfConvertible =
    typename std::enable_if<std::is_convertible<From, To>::value, To>::type;

template <typename T>
struct CudaSupportedTypeImpl {
  using type = T;
};

template <>
struct CudaSupportedTypeImpl<long long> {
  using type = unsigned long long;
};

template <>
struct CudaSupportedTypeImpl<unsigned long> {
  using type =
      typename std::conditional<sizeof(unsigned long) == sizeof(unsigned int),
                                unsigned int, unsigned long long>::type;
};

template <>
struct CudaSupportedTypeImpl<long> {
  // This cast should be safe since modular (wrap-around) addition works the
  // same on the unsigned type. However, signed overflow is undefined behavior
  // and is not handled correctly here.
  using type = typename CudaSupportedTypeImpl<unsigned long>::type;
};

template <typename T>
using CudaSupportedType = typename CudaSupportedTypeImpl<T>::type;

template <typename T>
__device__ CudaSupportedType<T>* ToCudaSupportedPtr(T* ptr) {
  return reinterpret_cast<CudaSupportedType<T>*>(ptr);
}

}  // namespace detail

// CUDA provides atomic ops, but not for all types. We provide wrappers
// for some ops and provide implementations for all reasonable types.

template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicAdd(T* ptr, U value) {
  return atomicAdd(detail::ToCudaSupportedPtr(ptr), value);
}

__device__ inline Eigen::half GpuAtomicAdd(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a + value; });
}

#if (__CUDA_ARCH__ < 600) || TENSORFLOW_USE_ROCM
__device__ inline double GpuAtomicAdd(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(ptr,
                                    [value](double a) { return a + value; });
}
#endif

// GpuAtomicAdd
// Specializations of GpuAtomicAdd for complex types, which GpuAtomicAdd does
// not support. We treat a std::complex<T>* as a T* (the C++ standard section
// 26.4.4 allows this explicitly) and atomically add the real and imaginary
// components individually. The operation as a whole is not atomic, but we can
// safely treat the components independently for the purpose of accumulating.

// ROCM TODO support GpuAtomicAdd for std::complex<>
#if GOOGLE_CUDA
__device__ inline std::complex<float> GpuAtomicAdd(std::complex<float>* ptr,
                                                   std::complex<float> value) {
  auto ptr_scalar = reinterpret_cast<float*>(ptr);
  return std::complex<float>(GpuAtomicAdd(ptr_scalar, value.real()),
                             GpuAtomicAdd(ptr_scalar + 1, value.imag()));
}

__device__ inline std::complex<double> GpuAtomicAdd(
    std::complex<double>* ptr, std::complex<double> value) {
  auto ptr_scalar = reinterpret_cast<double*>(ptr);
  return std::complex<double>(GpuAtomicAdd(ptr_scalar, value.real()),
                              GpuAtomicAdd(ptr_scalar + 1, value.imag()));
}
#endif
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicAdd, CudaAtomicAdd);
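
// Example (illustrative sketch, not part of this header): a scatter-add
// kernel that accumulates 'updates' into 'out' at positions given by
// 'indices', combining GpuGridRangeX with GpuAtomicAdd. All names are
// hypothetical.
//
//   template <typename T>
//   __global__ void ScatterAddKernel(int count, const int* indices,
//                                    const T* updates, T* out) {
//     for (int i : GpuGridRangeX(count)) {
//       GpuAtomicAdd(out + indices[i], updates[i]);
//     }
//   }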

// GpuAtomicSub
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicSub(T* ptr, U value) {
  return atomicSub(ptr, value);
}

// Specializations of subtraction which add the negative value.
__device__ inline float GpuAtomicSub(float* ptr, float value) {
  return GpuAtomicAdd(ptr, -value);
}

__device__ inline double GpuAtomicSub(double* ptr, double value) {
  return GpuAtomicAdd(ptr, -value);
}

__device__ inline tensorflow::int64 GpuAtomicSub(tensorflow::int64* ptr,
                                                 tensorflow::int64 value) {
  return GpuAtomicAdd(ptr, -value);
}

__device__ inline tensorflow::uint64 GpuAtomicSub(tensorflow::uint64* ptr,
                                                  tensorflow::uint64 value) {
  return GpuAtomicAdd(ptr, -static_cast<tensorflow::int64>(value));
}

__device__ inline Eigen::half GpuAtomicSub(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a - value; });
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicSub, CudaAtomicSub);

// GpuAtomicMax
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMax(T* ptr, U value) {
  return atomicMax(detail::ToCudaSupportedPtr(ptr), value);
}

#if TENSORFLOW_USE_ROCM

/*
 * CUDA runtime headers have the following defined
 *   __device__ int max(int, int)
 *   __device__ float max(float, float)
 *   __device__ double max(double, double)
 *
 * and many others, whereas HIP runtime headers only have the "int" version.
 *
 * Therefore we need to special-case the ROCm version to call the correct
 * underlying routines for the float and double types.
 */

__device__ inline float GpuAtomicMax(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](float a) { return fmaxf(a, value); });
}

__device__ inline double GpuAtomicMax(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return fmax(a, value); });
}

#else

__device__ inline float GpuAtomicMax(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](float a) { return max(a, value); });
}

__device__ inline double GpuAtomicMax(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return max(a, value); });
}

#endif

__device__ inline Eigen::half GpuAtomicMax(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return max(a, value); });
}

#if TENSORFLOW_USE_ROCM || (__CUDA_ARCH__ < 320)
__device__ inline tensorflow::uint64 GpuAtomicMax(tensorflow::uint64* ptr,
                                                  tensorflow::uint64 value) {
  return detail::GpuAtomicCasHelper(
      detail::ToCudaSupportedPtr(ptr),
      [value](tensorflow::uint64 a) { return max(a, value); });
}

__device__ inline int64 GpuAtomicMax(int64* ptr, int64 value) {
  return detail::GpuAtomicCasHelper(
      detail::ToCudaSupportedPtr(ptr),
      [value](int64 a) { return max(a, value); });
}
#endif
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMax, CudaAtomicMax);
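
// Example (illustrative sketch, not part of this header): computing the
// maximum of 'count' values into a single location with GpuAtomicMax. The
// kernel name and arguments are hypothetical; '*result' is assumed to be
// initialized to the lowest representable value beforehand.
//
//   __global__ void GlobalMaxKernel(int count, const float* in,
//                                   float* result) {
//     for (int i : GpuGridRangeX(count)) {
//       GpuAtomicMax(result, in[i]);
//     }
//   }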

// GpuAtomicMin
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMin(T* ptr, U value) {
  return atomicMin(detail::ToCudaSupportedPtr(ptr), value);
}

#if TENSORFLOW_USE_ROCM

/*
 * CUDA runtime headers have the following defined
 *   __device__ int min(int, int)
 *   __device__ float min(float, float)
 *   __device__ double min(double, double)
 *
 * and many others, whereas HIP runtime headers only have the "int" version.
 *
 * Therefore we need to special-case the ROCm version to call the correct
 * underlying routines for the float and double types.
 */

__device__ inline float GpuAtomicMin(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](float a) { return fminf(a, value); });
}

__device__ inline double GpuAtomicMin(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return fmin(a, value); });
}

#else

__device__ inline float GpuAtomicMin(float* ptr, float value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](float a) { return min(a, value); });
}

__device__ inline double GpuAtomicMin(double* ptr, double value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](double a) { return min(a, value); });
}

#endif

__device__ inline Eigen::half GpuAtomicMin(Eigen::half* ptr,
                                           Eigen::half value) {
  return detail::GpuAtomicCasHelper(
      ptr, [value](Eigen::half a) { return min(a, value); });
}

#if TENSORFLOW_USE_ROCM || (__CUDA_ARCH__ < 320)
__device__ inline tensorflow::uint64 GpuAtomicMin(tensorflow::uint64* ptr,
                                                  tensorflow::uint64 value) {
  return detail::GpuAtomicCasHelper(
      detail::ToCudaSupportedPtr(ptr),
      [value](tensorflow::uint64 a) { return min(a, value); });
}

__device__ inline int64 GpuAtomicMin(int64* ptr, int64 value) {
  return detail::GpuAtomicCasHelper(
      detail::ToCudaSupportedPtr(ptr),
      [value](int64 a) { return min(a, value); });
}
#endif
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMin, CudaAtomicMin);

// GpuAtomicMul
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMul(T* ptr, U value) {
  return detail::GpuAtomicCasHelper(ptr, [value](T a) { return a * value; });
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMul, CudaAtomicMul);

// GpuAtomicDiv
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicDiv(T* ptr, U value) {
  return detail::GpuAtomicCasHelper(ptr, [value](T a) { return a / value; });
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicDiv, CudaAtomicDiv);

// Operator overloads for complex numbers.
#if GOOGLE_CUDA
__device__ inline std::complex<float> operator+(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCaddf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<float> operator-(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCsubf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<float> operator*(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCmulf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<float> operator/(const std::complex<float>& a,
                                                const std::complex<float>& b) {
  auto result = cuCdivf(make_cuComplex(a.real(), a.imag()),
                        make_cuComplex(b.real(), b.imag()));
  return std::complex<float>(result.x, result.y);
}

__device__ inline std::complex<double> operator+(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCadd(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}

__device__ inline std::complex<double> operator-(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCsub(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}

__device__ inline std::complex<double> operator*(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCmul(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}

__device__ inline std::complex<double> operator/(
    const std::complex<double>& a, const std::complex<double>& b) {
  auto result = cuCdiv(make_cuDoubleComplex(a.real(), a.imag()),
                       make_cuDoubleComplex(b.real(), b.imag()));
  return std::complex<double>(result.x, result.y);
}
#endif  // GOOGLE_CUDA

namespace functor {
// ROCm hcc (clang) has severe difficulties dealing with std::complex directly
// due to a header issue. This template assists in casting std::complex into
// the corresponding internal ROCm types.
template <class T>
struct MapComplexToHipComplex {
  typedef T TM;
};

#if TENSORFLOW_USE_ROCM
template <>
struct MapComplexToHipComplex<std::complex<float> > {
  typedef hipFloatComplex TM;
};

template <>
struct MapComplexToHipComplex<std::complex<double> > {
  typedef hipDoubleComplex TM;
};
#endif
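
// Example (illustrative sketch, not part of this header): a launcher that
// forwards std::complex data to a device kernel through the ROCm-friendly
// type. 'LaunchMyComplexKernel' and its arguments are hypothetical.
//
//   template <typename T>
//   void LaunchMyComplexKernel(const T* in, T* out, int count) {
//     using TM = typename MapComplexToHipComplex<T>::TM;
//     // The kernel takes reinterpret_cast<const TM*>(in) and
//     // reinterpret_cast<TM*>(out) instead of std::complex pointers.
//   }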
}  // namespace functor

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#endif  // TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_