/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <algorithm>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/bias_op.h"
#include "tensorflow/core/kernels/bias_op_gpu.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"

#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
#include "tensorflow/core/kernels/reduction_ops_common.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// There are no native fp16 atomics (we simulate them using 32-bit atomics),
// so fp16 sums are done in fp32 internally. (We don't have a lot of shared
// memory traffic; BiasGradNCHW_SharedAtomics in particular works almost
// entirely on a local variable.)
template <class T>
struct AccumulatorType {
  typedef T type;
};

template <>
struct AccumulatorType<Eigen::half> {
  typedef float type;
};

// Definition of the GPU implementations declared in bias_op.cc.

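// Adds "bias" to "input" in NHWC layout. The channel is the innermost
// dimension, so the bias index of an element is simply index % bias_size.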
template <typename T>
__global__ void BiasNHWCKernel(int32 nthreads, const T* input, const T* bias,
                               T* output, int32 bias_size) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int32 bias_offset = index % bias_size;
    output[index] = ldg(input + index) + ldg(bias + bias_offset);
  }
}

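// Adds "bias" to "input" in NCHW layout. Each channel occupies a contiguous
// block of image_size elements, so the bias index of an element is
// (index / image_size) % bias_size.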
template <typename T>
__global__ void BiasNCHWKernel(int32 nthreads, const T* input, const T* bias,
                               T* output, int32 bias_size, int32 image_size) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int32 index2 = index / image_size;
    int32 bias_offset = index2 % bias_size;
    output[index] = ldg(input + index) + ldg(bias + bias_offset);
  }
}

// Add "bias" to "input", broadcasting it on all dimensions but the bias
// dimension.
template <typename T>
void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
                         T* output, int32 batch, int32 height, int32 width,
                         int depth, int32 channel, TensorFormat data_format) {
  const int32 bias_size = channel;
  const int32 image_size = height * width * depth;
  const int32 total_count = batch * bias_size * image_size;
  if (total_count == 0) {
    return;
  }
  CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
  if (data_format == FORMAT_NHWC) {
    TF_CHECK_OK(CudaLaunchKernel(BiasNHWCKernel<T>, config.block_count,
                                 config.thread_per_block, 0, d.stream(),
                                 config.virtual_thread_count, input, bias,
                                 output, bias_size));
  } else {
    TF_CHECK_OK(CudaLaunchKernel(BiasNCHWKernel<T>, config.block_count,
                                 config.thread_per_block, 0, d.stream(),
                                 config.virtual_thread_count, input, bias,
                                 output, bias_size, image_size));
  }
}

// A naive implementation that works in all cases.
template <typename T>
__global__ void BiasGradNHWC_Naive(int32 nthreads, const T* output_backprop,
                                   T* bias_backprop, int32 bias_size) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int32 bias_offset = index % bias_size;
    CudaAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));
  }
}

// A naive implementation that works in all cases.
template <typename T>
__global__ void BiasGradNCHW_Naive(int32 nthreads, const T* output_backprop,
                                   T* bias_backprop, int32 bias_size,
                                   int32 image_size) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int32 index2 = index / image_size;
    int32 bias_offset = index2 % bias_size;
    CudaAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));
  }
}

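// Dynamically allocated shared memory used by BiasGradNHWC_SharedAtomics.
// Its size (bias_size * sizeof(AccT)) is supplied as the third kernel launch
// parameter below.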
extern __shared__ char s_buf[];

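// NHWC gradient with shared-memory atomics: each block zero-initializes a
// bias_size accumulator in shared memory, accumulates its grid-stride slice
// of output_backprop into it, and then flushes the per-block partial sums to
// bias_backprop with one global atomic add per bias element.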
template <typename T>
__global__ void BiasGradNHWC_SharedAtomics(int32 nthreads,
                                           const T* output_backprop,
                                           T* bias_backprop, int32 bias_size) {
  typedef typename AccumulatorType<T>::type AccT;
  AccT* s_data = reinterpret_cast<AccT*>(s_buf);
  for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) {
    s_data[index] = AccT(0);
  }
  __syncthreads();

  for (int32 index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int32 bias_offset = index % bias_size;
    CudaAtomicAdd(s_data + bias_offset, AccT(ldg(output_backprop + index)));
  }
  __syncthreads();

  for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) {
    CudaAtomicAdd(bias_backprop + index, T(s_data[index]));
  }
}

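// NCHW gradient with shared-memory atomics: blockIdx.x encodes a
// (group, channel) pair via blockIdx.x / bias_size and blockIdx.x % bias_size.
// Each thread sums its strided slice of the channel into a register; the
// per-thread sums are combined with shared-memory atomics plus a warp shuffle
// reduction, and thread 0 issues a single global atomic add per block.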
template <typename T>
__global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop,
                                           T* bias_backprop, int32 batch,
                                           int32 bias_size, int32 image_size,
                                           int group_size) {
  // Initialize the shared memory.
  typedef typename AccumulatorType<T>::type AccT;
  const int32 kSDataSize = 32;
  __shared__ AccT s_data[kSDataSize];
  for (int32 index = threadIdx.x; index < kSDataSize; index += blockDim.x) {
    s_data[index] = AccT(0);
  }
  __syncthreads();

  // Accumulate all the values within this thread. They all have the same bias
  // index.
  int32 bias_index = blockIdx.x % bias_size;
  int32 group_index = blockIdx.x / bias_size;
  int32 total_count = batch * image_size;
  AccT sum(0);
  for (int32 index = group_index * blockDim.x + threadIdx.x;
       index < total_count; index += blockDim.x * group_size) {
    int32 image_offset = index % image_size;
    int32 batch = index / image_size;
    T val = ldg(output_backprop +
                (batch * bias_size + bias_index) * image_size + image_offset);
    sum += AccT(val);
  }

  // Write the sum accumulated in this thread to shared memory. Each thread
  // shifts its write location to avoid bank conflicts.
  int bias_offset = threadIdx.x % 32;
  CudaAtomicAdd(s_data + bias_offset, sum);
  __syncthreads();

  // Accumulate the results in shared memory into the first element.
  // No __syncthreads() is needed since this runs entirely within one warp.
  int32 thread_index = threadIdx.x;
  if (thread_index < 32) {
    AccT data = s_data[thread_index];
    for (int32 delta = warpSize / 2; delta > 0; delta /= 2) {
      data += CudaShuffleXorSync(kCudaWarpAll, data, delta);
    }
    if (thread_index == 0) {
      CudaAtomicAdd(bias_backprop + bias_index, T(data));
    }
  }
}

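// Computes the bias gradient by summing output_backprop over every dimension
// except the bias (channel) dimension. For NHWC, the shared-atomics kernel is
// used when a bias_size accumulator fits in half of the per-block shared
// memory; otherwise it falls back to the naive global-atomic kernel. NCHW
// always takes the shared-atomics path, since its accumulator is a fixed
// 32-entry array.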
template <typename T>
void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
                             T* bias_backprop, int32 batch, int32 height,
                             int32 width, int32 depth, int32 channel,
                             TensorFormat data_format) {
  const int32 bias_size = channel;
  const int32 image_size = height * width * depth;
  const int32 total_count = batch * bias_size * image_size;
  if (total_count == 0) {
    return;
  }
  static constexpr int32 kWarpSize = 32;
  CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);

  const int max_shared_memory_size = d.sharedMemPerBlock() / 2;
  int32 shared_memory_size = 0;
  if (data_format == FORMAT_NHWC) {
    shared_memory_size = bias_size * sizeof(typename AccumulatorType<T>::type);
  }
  // Check if we have enough shared memory.
  if (shared_memory_size <= max_shared_memory_size) {
    if (data_format == FORMAT_NHWC) {
      BiasGradNHWC_SharedAtomics<T>
          <<<config.block_count, config.thread_per_block, shared_memory_size,
             d.stream()>>>(total_count, output_backprop, bias_backprop,
                           bias_size);
    } else {
      // Round up the block count to a multiple of bias_size.
      int group_size = (config.block_count + bias_size - 1) / bias_size;
      config.block_count = group_size * bias_size;
      if (config.thread_per_block < kWarpSize) {
        config.thread_per_block = kWarpSize;
      }
      TF_CHECK_OK(CudaLaunchKernel(
          BiasGradNCHW_SharedAtomics<T>, config.block_count,
          config.thread_per_block, 0, d.stream(), output_backprop,
          bias_backprop, batch, bias_size, image_size, group_size));
    }
  } else {
    // Note that even if we don't have enough shared memory to fit the entire
    // output block, it is possible to process one group of elements at a time.
    // But for now, we simply fall back to the naive implementation.
    if (data_format == FORMAT_NHWC) {
      TF_CHECK_OK(CudaLaunchKernel(
          BiasGradNHWC_Naive<T>, config.block_count, config.thread_per_block,
          0, d.stream(), total_count, output_backprop, bias_backprop,
          bias_size));
    } else {
      TF_CHECK_OK(CudaLaunchKernel(BiasGradNCHW_Naive<T>, config.block_count,
                                   config.thread_per_block, 0, d.stream(),
                                   total_count, output_backprop, bias_backprop,
                                   bias_size, image_size));
    }
  }
}

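// Sums each row of the [rows, cols] input into output (one value per row)
// via the CUB-based ReduceImpl.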
template <typename T>
void BiasGradGPU<T>::DoRowReduction(OpKernelContext* context, T* output,
                                    const T* input, int rows, int cols) {
  typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
  Constants<GPUDevice> constants;
  cub::Sum op;
  functor::ReduceImpl<T, cub::Sum, T*, const T*, ReductionAxes>(
      context, output, input, 2, rows, cols, 1, 1, constants.kOne, op);
}

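// Sums each column of the [rows, cols] input into output (one value per
// column) via the CUB-based ReduceImpl.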
template <typename T>
void BiasGradGPU<T>::DoColReduction(OpKernelContext* context, T* output,
                                    const T* input, int rows, int cols) {
  typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
  Constants<GPUDevice> constants;
  cub::Sum op;
  functor::ReduceImpl<T, cub::Sum, T*, const T*, ReductionAxes>(
      context, output, input, 2, rows, cols, 1, 1, constants.kZero, op);
}

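// Explicit instantiations for every type covered by TF_CALL_GPU_NUMBER_TYPES.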
#define DEFINE_GPU_SPECS(T)   \
  template struct BiasGPU<T>; \
  template struct BiasGradGPU<T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA