/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

#include <algorithm>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/bias_op.h"
#include "tensorflow/core/kernels/bias_op_gpu.h"
#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
#include "tensorflow/core/kernels/reduction_ops_common.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// There are no native fp16 atomics (we simulate them using 32-bit atomics),
// so fp16 sums are done in fp32 internally. (We don't have a lot of shared
// memory traffic; BiasGradNCHW_SharedAtomics in particular works almost
// entirely on a local variable.)
template <class T>
struct AccumulatorType {
  typedef T type;
};

template <>
struct AccumulatorType<Eigen::half> {
  typedef float type;
};
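
// For reference, the trait only changes the precision of fp16 sums:
//   AccumulatorType<Eigen::half>::type  -> float
//   AccumulatorType<float>::type        -> float  (unchanged)
//   AccumulatorType<double>::type       -> double (unchanged)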

// Definition of the GPU implementations declared in bias_op.cc.

// Adds `bias` to `input` in N..C (NHWC) layout: the channel is the
// fastest-varying component of the flat index.
template <typename T>
__global__ void BiasNHWCKernel(int32 nthreads, const T* __restrict__ input,
                               const T* __restrict__ bias,
                               T* __restrict__ output, int32 bias_size) {
  GPU_1D_KERNEL_LOOP(index, nthreads) {
    int32 bias_offset = index % bias_size;
    output[index] = ldg(input + index) + ldg(bias + bias_offset);
  }
}

// Adds `bias` to `input` in NC.. (NCHW) layout: all `image_size` elements of a
// channel share the same bias value.
template <typename T>
__global__ void BiasNCHWKernel(int32 nthreads, const T* __restrict__ input,
                               const T* __restrict__ bias,
                               T* __restrict__ output, int32 bias_size,
                               int32 image_size) {
  GPU_1D_KERNEL_LOOP(index, nthreads) {
    int32 index2 = index / image_size;
    int32 bias_offset = index2 % bias_size;
    output[index] = ldg(input + index) + ldg(bias + bias_offset);
  }
}

// Add "bias" to "input", broadcasting it on all dimensions but the bias
// dimension.
template <typename T>
void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
                         T* output, int32 batch, int32 height, int32 width,
                         int depth, int32 channel, TensorFormat data_format) {
  const int32 bias_size = channel;
  const int32 image_size = height * width * depth;
  const int32 total_count = batch * bias_size * image_size;
  if (total_count == 0) {
    return;
  }
  GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);
  if (data_format == FORMAT_NHWC) {
    TF_CHECK_OK(GpuLaunchKernel(BiasNHWCKernel<T>, config.block_count,
                                config.thread_per_block, 0, d.stream(),
                                config.virtual_thread_count, input, bias,
                                output, bias_size));
  } else {
    TF_CHECK_OK(GpuLaunchKernel(BiasNCHWKernel<T>, config.block_count,
                                config.thread_per_block, 0, d.stream(),
                                config.virtual_thread_count, input, bias,
                                output, bias_size, image_size));
  }
}
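
// Index mapping, for reference: in NHWC layout the channel is the
// fastest-varying part of the flat index, so bias_offset = index % channel and
// output(n, ..., c) = input(n, ..., c) + bias(c); in NCHW layout each channel
// owns a contiguous run of height * width * depth elements, so
// bias_offset = (index / image_size) % channel.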

// A naive implementation that is functional in all cases.
template <typename T>
__global__ void BiasGradNHWC_Naive(int32 nthreads,
                                   const T* __restrict__ output_backprop,
                                   T* __restrict__ bias_backprop,
                                   int32 bias_size) {
  GPU_1D_KERNEL_LOOP(index, nthreads) {
    int32 bias_offset = index % bias_size;
    GpuAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));
  }
}

// A naive implementation that is functional in all cases.
template <typename T>
__global__ void BiasGradNCHW_Naive(int32 nthreads,
                                   const T* __restrict__ output_backprop,
                                   T* __restrict__ bias_backprop,
                                   int32 bias_size, int32 image_size) {
  GPU_1D_KERNEL_LOOP(index, nthreads) {
    int32 index2 = index / image_size;
    int32 bias_offset = index2 % bias_size;
    GpuAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));
  }
}

// Accumulates the NHWC bias gradient in a shared-memory copy of the bias
// vector, then flushes the per-block partial sums to global memory with one
// atomic add per bias element.
template <typename T>
__global__ void BiasGradNHWC_SharedAtomics(
    int32 nthreads, const T* __restrict__ output_backprop,
    T* __restrict__ bias_backprop, int32 bias_size) {
  typedef typename AccumulatorType<T>::type AccT;
  GPU_DYNAMIC_SHARED_MEM_DECL(8, char, s_buf);
  AccT* s_data = reinterpret_cast<AccT*>(s_buf);
  // Zero-initialize the shared-memory accumulators.
  for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) {
    s_data[index] = AccT(0);
  }
  __syncthreads();

  // Accumulate this block's slice of the backprop into shared memory.
  for (int32 index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int32 bias_offset = index % bias_size;
    GpuAtomicAdd(s_data + bias_offset, AccT(ldg(output_backprop + index)));
  }
  __syncthreads();

  // Flush the per-block partial sums to the global gradient.
  for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) {
    GpuAtomicAdd(bias_backprop + index, T(s_data[index]));
  }
}

// Computes the NCHW bias gradient. Each block handles one
// (bias_index, group_index) pair: thread-local sums are combined through a
// 32-element shared-memory buffer and a warp-level reduction, and a single
// atomic add per block updates the global gradient.
template <typename T>
__global__ void BiasGradNCHW_SharedAtomics(
    const T* __restrict__ output_backprop, T* __restrict__ bias_backprop,
    int32 batch, int32 bias_size, int32 image_size, int group_size) {
  // Initialize the shared memory.
  typedef typename AccumulatorType<T>::type AccT;
  const int32 kSDataSize = 32;
  __shared__ AccT s_data[kSDataSize];
  for (int32 index = threadIdx.x; index < kSDataSize; index += blockDim.x) {
    s_data[index] = AccT(0);
  }
  __syncthreads();

  // Accumulate all the values within this thread. They all have the same bias
  // index.
  int32 bias_index = blockIdx.x % bias_size;
  int32 group_index = blockIdx.x / bias_size;
  int32 total_count = batch * image_size;
  AccT sum(0);
  for (int32 index = group_index * blockDim.x + threadIdx.x;
       index < total_count; index += blockDim.x * group_size) {
    int32 image_offset = index % image_size;
    int32 batch = index / image_size;
    T val = ldg(output_backprop +
                (batch * bias_size + bias_index) * image_size + image_offset);
    sum += AccT(val);
  }

  // Write the accumulated sum in this thread to the shared memory. Each thread
  // shifts its write location to avoid bank conflicts.
  int bias_offset = threadIdx.x % 32;
  GpuAtomicAdd(s_data + bias_offset, sum);
  __syncthreads();

  // Accumulate the results in the shared memory into the first element.
  // No syncthreads is needed since this is only in the same warp.
  int32 thread_index = threadIdx.x;
#if GOOGLE_CUDA
  if (thread_index < 32) {
    AccT data = s_data[thread_index];
    for (int32 delta = warpSize / 2; delta > 0; delta /= 2) {
      data += GpuShuffleXorSync(kCudaWarpAll, data, delta);
    }
    if (thread_index == 0) {
      GpuAtomicAdd(bias_backprop + bias_index, T(data));
    }
  }
#elif TENSORFLOW_USE_ROCM
  if (thread_index < 16) s_data[thread_index] += s_data[thread_index + 16];
  if (thread_index < 8) s_data[thread_index] += s_data[thread_index + 8];
  if (thread_index < 4) s_data[thread_index] += s_data[thread_index + 4];
  if (thread_index < 2) s_data[thread_index] += s_data[thread_index + 2];
  if (thread_index < 1) s_data[thread_index] += s_data[thread_index + 1];

  // The first thread writes out the accumulated result to the global location.
  if (thread_index == 0) {
    GpuAtomicAdd(bias_backprop + bias_index, T(s_data[0]));
  }
#endif
}
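
// For reference, the CUDA branch above is a butterfly reduction: with deltas
// 16, 8, 4, 2, 1 each lane adds the value held by the lane whose id differs in
// one bit, so after five steps every lane (including lane 0) holds the sum of
// all 32 shared-memory slots. The ROCm branch computes the same sum by
// repeatedly folding the upper half of s_data onto the lower half.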

template <typename T>
void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
                             T* bias_backprop, int32 batch, int32 height,
                             int32 width, int32 depth, int32 channel,
                             TensorFormat data_format) {
  const int32 bias_size = channel;
  const int32 image_size = height * width * depth;
  const int32 total_count = batch * bias_size * image_size;
  if (total_count == 0) {
    return;
  }
  static constexpr int32 kWarpSize = 32;
  GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);

  const int max_shared_memory_size = d.sharedMemPerBlock() / 2;
  int32 shared_memory_size = 0;
  if (data_format == FORMAT_NHWC) {
    shared_memory_size = bias_size * sizeof(typename AccumulatorType<T>::type);
  }
  // Check if we have enough shared memory.
  if (shared_memory_size <= max_shared_memory_size) {
    if (data_format == FORMAT_NHWC) {
      TF_CHECK_OK(GpuLaunchKernel(BiasGradNHWC_SharedAtomics<T>,
                                  config.block_count, config.thread_per_block,
                                  shared_memory_size, d.stream(), total_count,
                                  output_backprop, bias_backprop, bias_size));
    } else {
      // Round up the block count to a multiple of bias_size.
      int group_size = (config.block_count + bias_size - 1) / bias_size;
      config.block_count = group_size * bias_size;
      if (config.thread_per_block < kWarpSize) {
        config.thread_per_block = kWarpSize;
      }
      TF_CHECK_OK(GpuLaunchKernel(BiasGradNCHW_SharedAtomics<T>,
                                  config.block_count, config.thread_per_block,
                                  0, d.stream(), output_backprop, bias_backprop,
                                  batch, bias_size, image_size, group_size));
    }
  } else {
    // Note that even if we don't have enough shared memory to fit the entire
    // output block, it is possible to process one group of elements at a time.
    // But for now, we simply fall back to the naive implementation.
    if (data_format == FORMAT_NHWC) {
      TF_CHECK_OK(GpuLaunchKernel(
          BiasGradNHWC_Naive<T>, config.block_count, config.thread_per_block, 0,
          d.stream(), total_count, output_backprop, bias_backprop, bias_size));
    } else {
      TF_CHECK_OK(GpuLaunchKernel(BiasGradNCHW_Naive<T>, config.block_count,
                                  config.thread_per_block, 0, d.stream(),
                                  total_count, output_backprop, bias_backprop,
                                  bias_size, image_size));
    }
  }
}
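
// For example, with bias_size = 64 and an initial block_count of 100, the
// NCHW path above computes group_size = 2 and rounds block_count up to 128, so
// every channel is covered by exactly group_size blocks, matching the
// blockIdx.x decomposition in BiasGradNCHW_SharedAtomics.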

// Sums each row of a [rows, cols] matrix (reducing along axis 1) into
// `output`, which holds `rows` elements.
template <typename T>
void BiasGradGPU<T>::DoRowReduction(OpKernelContext* context, T* output,
                                    const T* input, int rows, int cols) {
  typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
  Constants<GPUDevice> constants;
  gpuprim::Sum op;
  functor::ReduceImpl<T, gpuprim::Sum, T*, const T*, ReductionAxes>(
      context, output, input, 2, rows, cols, 1, 1, constants.kOne, op);
}

// Sums each column of a [rows, cols] matrix (reducing along axis 0) into
// `output`, which holds `cols` elements.
template <typename T>
void BiasGradGPU<T>::DoColReduction(OpKernelContext* context, T* output,
                                    const T* input, int rows, int cols) {
  typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
  Constants<GPUDevice> constants;
  gpuprim::Sum op;
  functor::ReduceImpl<T, gpuprim::Sum, T*, const T*, ReductionAxes>(
      context, output, input, 2, rows, cols, 1, 1, constants.kZero, op);
}

#define DEFINE_GPU_SPECS(T)   \
  template struct BiasGPU<T>; \
  template struct BiasGradGPU<T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);

// No BiasGrad kernel for int32.
template struct BiasGPU<int32>;

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM