/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <algorithm>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/bias_op.h"
#include "tensorflow/core/kernels/bias_op_gpu.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"

#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
#include "tensorflow/core/kernels/reduction_ops_common.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// There are no native fp16 atomics (we simulate them using 32-bit atomics),
// so fp16 sums are done in fp32 internally. (We don't have a lot of shared
// memory traffic; BiasGradNCHW_SharedAtomics in particular works almost
// entirely on a local variable.)
template <class T>
struct AccumulatorType {
  typedef T type;
};

template <>
struct AccumulatorType<Eigen::half> {
  typedef float type;
};
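// For example, AccumulatorType<Eigen::half>::type is float, so the gradient
// kernels below accumulate half values in fp32 and convert back to half only
// when the final sum is written out.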

// Definition of the GPU implementations declared in bias_op.cc.

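// In NHWC the channel is the innermost (fastest-varying) dimension of the
// flattened tensor, so each element's bias index is simply index % bias_size.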
template <typename T>
__global__ void BiasNHWCKernel(int32 nthreads, const T* input, const T* bias,
                               T* output, int32 bias_size) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int32 bias_offset = index % bias_size;
    output[index] = ldg(input + index) + ldg(bias + bias_offset);
  }
}

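// In NCHW an image_size-long run of elements shares one channel, so the bias
// index is (index / image_size) % bias_size, i.e. the channel component of
// the flattened (n, c, h, w) index.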
template <typename T>
__global__ void BiasNCHWKernel(int32 nthreads, const T* input, const T* bias,
                               T* output, int32 bias_size, int32 image_size) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int32 index2 = index / image_size;
    int32 bias_offset = index2 % bias_size;
    output[index] = ldg(input + index) + ldg(bias + bias_offset);
  }
}

// Add "bias" to "input", broadcasting it on all dimensions but the bias
// dimension.
template <typename T>
void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
                         T* output, int32 batch, int32 height, int32 width,
                         int depth, int32 channel, TensorFormat data_format) {
  const int32 bias_size = channel;
  const int32 image_size = height * width * depth;
  const int32 total_count = batch * bias_size * image_size;
  if (total_count == 0) {
    return;
  }
  CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
  if (data_format == FORMAT_NHWC) {
    TF_CHECK_OK(CudaLaunchKernel(BiasNHWCKernel<T>, config.block_count,
                                 config.thread_per_block, 0, d.stream(),
                                 config.virtual_thread_count, input, bias,
                                 output, bias_size));
  } else {
    TF_CHECK_OK(CudaLaunchKernel(BiasNCHWKernel<T>, config.block_count,
                                 config.thread_per_block, 0, d.stream(),
                                 config.virtual_thread_count, input, bias,
                                 output, bias_size));
  }
}
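// Illustrative sketch (hypothetical shapes): adding a per-channel bias to a
// float NHWC tensor of shape [N, H, W, C] could be expressed as
//   BiasGPU<float>::compute(d, in, bias, out, N, H, W, /*depth=*/1, C,
//                           FORMAT_NHWC);
// which launches one virtual thread per output element (N * H * W * C total).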

// A naive implementation that is functional on all cases.
template <typename T>
__global__ void BiasGradNHWC_Naive(int32 nthreads, const T* output_backprop,
                                   T* bias_backprop, int32 bias_size) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int32 bias_offset = index % bias_size;
    CudaAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));
  }
}

// A naive implementation that is functional on all cases.
template <typename T>
__global__ void BiasGradNCHW_Naive(int32 nthreads, const T* output_backprop,
                                   T* bias_backprop, int32 bias_size,
                                   int32 image_size) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int32 index2 = index / image_size;
    int32 bias_offset = index2 % bias_size;
    CudaAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));
  }
}

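// Dynamically sized shared-memory scratch space for
// BiasGradNHWC_SharedAtomics; its byte count is supplied as the shared-memory
// argument of the kernel launch in BiasGradGPU<T>::compute below.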
extern __shared__ char s_buf[];

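// Shared-atomics NHWC gradient: each block zeroes one accumulator per channel
// in shared memory, atomically adds its elements' gradients into those
// per-channel bins, and finally flushes each bin to global memory with a
// single atomic add per channel per block.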
template <typename T>
__global__ void BiasGradNHWC_SharedAtomics(int32 nthreads,
                                           const T* output_backprop,
                                           T* bias_backprop, int32 bias_size) {
  typedef typename AccumulatorType<T>::type AccT;
  AccT* s_data = reinterpret_cast<AccT*>(s_buf);
  for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) {
    s_data[index] = AccT(0);
  }
  __syncthreads();

  for (int32 index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int32 bias_offset = index % bias_size;
    CudaAtomicAdd(s_data + bias_offset, AccT(ldg(output_backprop + index)));
  }
  __syncthreads();

  for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) {
    CudaAtomicAdd(bias_backprop + index, T(s_data[index]));
  }
}

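// Shared-atomics NCHW gradient: the grid is laid out as bias_size * group_size
// blocks, so every block handles a single channel (blockIdx.x % bias_size).
// Each thread first accumulates its share of that channel's gradients into a
// register, the per-thread sums are combined through 32 shared-memory bins and
// a warp shuffle, and one atomic add per block writes the result to
// bias_backprop.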
template <typename T>
__global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop,
                                           T* bias_backprop, int32 batch,
                                           int32 bias_size, int32 image_size,
                                           int group_size) {
  // Initialize the shared memory.
  typedef typename AccumulatorType<T>::type AccT;
  const int32 kSDataSize = 32;
  __shared__ AccT s_data[kSDataSize];
  for (int32 index = threadIdx.x; index < kSDataSize; index += blockDim.x) {
    s_data[index] = AccT(0);
  }
  __syncthreads();

  // Accumulate all the values within this thread. They all have the same bias
  // index.
  int32 bias_index = blockIdx.x % bias_size;
  int32 group_index = blockIdx.x / bias_size;
  int32 total_count = batch * image_size;
  AccT sum(0);
  for (int32 index = group_index * blockDim.x + threadIdx.x;
       index < total_count; index += blockDim.x * group_size) {
    int32 image_offset = index % image_size;
    int32 batch = index / image_size;
    T val = ldg(output_backprop +
                (batch * bias_size + bias_index) * image_size + image_offset);
    sum += AccT(val);
  }

  // Write the accumulated sum in this thread to the shared memory. Each thread
  // shifts their write location to avoid bank conflict.
  int bias_offset = threadIdx.x % 32;
  CudaAtomicAdd(s_data + bias_offset, sum);
  __syncthreads();

  // Accumulate the results in the shared memory into the first element.
  // No syncthreads is needed since this is only in the same warp.
  int32 thread_index = threadIdx.x;
  if (thread_index < 32) {
    AccT data = s_data[thread_index];
    for (int32 delta = warpSize / 2; delta > 0; delta /= 2) {
      data += CudaShuffleXorSync(kCudaWarpAll, data, delta);
    }
    if (thread_index == 0) {
      CudaAtomicAdd(bias_backprop + bias_index, T(data));
    }
  }
}

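// Dispatch for the bias gradient: for NHWC the shared-atomics kernel needs
// bias_size dynamic shared-memory accumulators per block, so it is used only
// when that fits in half of the device's shared memory per block; otherwise we
// fall back to the naive global-atomics kernels. The NCHW shared-atomics
// kernel uses a fixed 32-entry static buffer, so shared_memory_size stays 0
// for NCHW and it always passes the check.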
template <typename T>
void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
                             T* bias_backprop, int32 batch, int32 height,
                             int32 width, int32 depth, int32 channel,
                             TensorFormat data_format) {
  const int32 bias_size = channel;
  const int32 image_size = height * width * depth;
  const int32 total_count = batch * bias_size * image_size;
  if (total_count == 0) {
    return;
  }
  static constexpr int32 kWarpSize = 32;
  CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);

  const int max_shared_memory_size = d.sharedMemPerBlock() / 2;
  int32 shared_memory_size = 0;
  if (data_format == FORMAT_NHWC) {
    shared_memory_size = bias_size * sizeof(typename AccumulatorType<T>::type);
  }
  // Check if we have enough shared memory.
  if (shared_memory_size <= max_shared_memory_size) {
    if (data_format == FORMAT_NHWC) {
      BiasGradNHWC_SharedAtomics<T>
          <<<config.block_count, config.thread_per_block, shared_memory_size,
             d.stream()>>>(total_count, output_backprop, bias_backprop,
                           bias_size);
    } else {
      // Round up the block count to multiple of bias_size.
      int group_size = (config.block_count + bias_size - 1) / bias_size;
      config.block_count = group_size * bias_size;
      if (config.thread_per_block < kWarpSize) {
        config.thread_per_block = kWarpSize;
      }
      TF_CHECK_OK(CudaLaunchKernel(
          BiasGradNCHW_SharedAtomics<T>, config.block_count,
          config.thread_per_block, 0, d.stream(), output_backprop,
          bias_backprop, batch, bias_size, image_size, group_size));
    }
  } else {
    // Note that even if we don't have enough shared memory to fit the entire
    // output block, it is possible to process one group of elements at a time.
    // But for now, we simply fall back to the naive implementation.
    if (data_format == FORMAT_NHWC) {
      TF_CHECK_OK(CudaLaunchKernel(
          BiasGradNHWC_Naive<T>, config.block_count, config.thread_per_block, 0,
          d.stream(), total_count, output_backprop, bias_backprop, bias_size));
    } else {
      TF_CHECK_OK(CudaLaunchKernel(BiasGradNCHW_Naive<T>, config.block_count,
                                   config.thread_per_block, 0, d.stream(),
                                   total_count, output_backprop, bias_backprop,
                                   bias_size, image_size));
    }
  }
}

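// The two reductions below view the backprop as a logical [rows, cols] matrix
// and sum it with the generic ReduceImpl: DoRowReduction reduces along axis 1
// (one sum per row, selected via constants.kOne), DoColReduction along axis 0
// (one sum per column, via constants.kZero).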
template <typename T>
void BiasGradGPU<T>::DoRowReduction(OpKernelContext* context, T* output,
                                    const T* input, int rows, int cols) {
  typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
  Constants<GPUDevice> constants;
  cub::Sum op;
  functor::ReduceImpl<T, cub::Sum, T*, const T*, ReductionAxes>(
      context, output, input, 2, rows, cols, 1, 1, constants.kOne, op);
}

template <typename T>
void BiasGradGPU<T>::DoColReduction(OpKernelContext* context, T* output,
                                    const T* input, int rows, int cols) {
  typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
  Constants<GPUDevice> constants;
  cub::Sum op;
  functor::ReduceImpl<T, cub::Sum, T*, const T*, ReductionAxes>(
      context, output, input, 2, rows, cols, 1, 1, constants.kZero, op);
}

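// Explicitly instantiate BiasGPU and BiasGradGPU for each GPU number type
// handled by TF_CALL_GPU_NUMBER_TYPES, so the definitions above match the
// declarations used by the kernels registered in bias_op.cc.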
#define DEFINE_GPU_SPECS(T)   \
  template struct BiasGPU<T>; \
  template struct BiasGradGPU<T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA