/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

#include <algorithm>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/bias_op.h"
#include "tensorflow/core/kernels/bias_op_gpu.h"
#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
#include "tensorflow/core/kernels/reduction_ops_common.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// There are no native fp16 atomics (we simulate them using 32-bit atomics),
// so fp16 sums are done in fp32 internally. (We don't have a lot of shared
// memory traffic; BiasGradNCHW_SharedAtomics in particular works almost
// entirely on a local variable.)
template <class T>
struct AccumulatorType {
  typedef T type;
};

template <>
struct AccumulatorType<Eigen::half> {
  typedef float type;
};

// Definition of the GPU implementations declared in bias_op.cc.

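// In NHWC layout the channel is the innermost dimension, so the bias index is
// simply the flat element index modulo the number of channels.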
template <typename T>
__global__ void BiasNHWCKernel(int32 nthreads, const T* __restrict__ input,
                               const T* __restrict__ bias,
                               T* __restrict__ output, int32 bias_size) {
  GPU_1D_KERNEL_LOOP(index, nthreads) {
    int32 bias_offset = index % bias_size;
    output[index] = ldg(input + index) + ldg(bias + bias_offset);
  }
}

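// In NCHW layout all elements of one image plane share the same channel, so
// the flat index is divided by the image size before taking the remainder
// modulo the number of channels.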
template <typename T>
__global__ void BiasNCHWKernel(int32 nthreads, const T* __restrict__ input,
                               const T* __restrict__ bias,
                               T* __restrict__ output, int32 bias_size,
                               int32 image_size) {
  GPU_1D_KERNEL_LOOP(index, nthreads) {
    int32 index2 = index / image_size;
    int32 bias_offset = index2 % bias_size;
    output[index] = ldg(input + index) + ldg(bias + bias_offset);
  }
}

// Add "bias" to "input", broadcasting it on all dimensions but the bias
// dimension.
template <typename T>
void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
                         T* output, int32 batch, int32 height, int32 width,
                         int depth, int32 channel, TensorFormat data_format) {
  const int32 bias_size = channel;
  const int32 image_size = height * width * depth;
  const int32 total_count = batch * bias_size * image_size;
  if (total_count == 0) {
    return;
  }
  GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);
  if (data_format == FORMAT_NHWC) {
    TF_CHECK_OK(GpuLaunchKernel(BiasNHWCKernel<T>, config.block_count,
                                config.thread_per_block, 0, d.stream(),
                                config.virtual_thread_count, input, bias,
                                output, bias_size));
  } else {
    TF_CHECK_OK(GpuLaunchKernel(BiasNCHWKernel<T>, config.block_count,
                                config.thread_per_block, 0, d.stream(),
                                config.virtual_thread_count, input, bias,
                                output, bias_size, image_size));
  }
}

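// The gradient kernels below reduce output_backprop over every dimension
// except the bias dimension. The *_Naive variants accumulate directly into
// global memory with atomics; the *_SharedAtomics variants first combine
// partial sums in shared memory to reduce global atomic traffic.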
// A naive implementation that is functional on all cases.
template <typename T>
__global__ void BiasGradNHWC_Naive(int32 nthreads,
                                   const T* __restrict__ output_backprop,
                                   T* __restrict__ bias_backprop,
                                   int32 bias_size) {
  GPU_1D_KERNEL_LOOP(index, nthreads) {
    int32 bias_offset = index % bias_size;
    GpuAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));
  }
}

// A naive implementation that is functional on all cases.
template <typename T>
__global__ void BiasGradNCHW_Naive(int32 nthreads,
                                   const T* __restrict__ output_backprop,
                                   T* __restrict__ bias_backprop,
                                   int32 bias_size, int32 image_size) {
  GPU_1D_KERNEL_LOOP(index, nthreads) {
    int32 index2 = index / image_size;
    int32 bias_offset = index2 % bias_size;
    GpuAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));
  }
}

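// Keeps one partial sum per bias element in dynamically allocated shared
// memory, so global atomics are only issued once per bias element per block.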
template <typename T>
__global__ void BiasGradNHWC_SharedAtomics(
    int32 nthreads, const T* __restrict__ output_backprop,
    T* __restrict__ bias_backprop, int32 bias_size) {
  typedef typename AccumulatorType<T>::type AccT;
  GPU_DYNAMIC_SHARED_MEM_DECL(8, char, s_buf);
  AccT* s_data = reinterpret_cast<AccT*>(s_buf);
  for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) {
    s_data[index] = AccT(0);
  }
  __syncthreads();

  for (int32 index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int32 bias_offset = index % bias_size;
    GpuAtomicAdd(s_data + bias_offset, AccT(ldg(output_backprop + index)));
  }
  __syncthreads();

  for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) {
    GpuAtomicAdd(bias_backprop + index, T(s_data[index]));
  }
}

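// Each block handles a single bias (channel) index: every thread accumulates
// a private partial sum over a strided slice of the batch * image elements,
// the partial sums are combined through a 32-entry shared buffer, and the
// first thread issues one global atomic add per block.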
template <typename T>
__global__ void BiasGradNCHW_SharedAtomics(
    const T* __restrict__ output_backprop, T* __restrict__ bias_backprop,
    int32 batch, int32 bias_size, int32 image_size, int group_size) {
  // Initialize the shared memory.
  typedef typename AccumulatorType<T>::type AccT;
  const int32 kSDataSize = 32;
  __shared__ AccT s_data[kSDataSize];
  for (int32 index = threadIdx.x; index < kSDataSize; index += blockDim.x) {
    s_data[index] = AccT(0);
  }
  __syncthreads();

  // Accumulate all the values within this thread. They all have the same bias
  // index.
  int32 bias_index = blockIdx.x % bias_size;
  int32 group_index = blockIdx.x / bias_size;
  int32 total_count = batch * image_size;
  AccT sum(0);
  for (int32 index = group_index * blockDim.x + threadIdx.x;
       index < total_count; index += blockDim.x * group_size) {
    int32 image_offset = index % image_size;
    int32 batch = index / image_size;
    T val = ldg(output_backprop +
                (batch * bias_size + bias_index) * image_size + image_offset);
    sum += AccT(val);
  }

  // Write the accumulated sum in this thread to the shared memory. Each thread
  // shifts their write location to avoid bank conflict.
  int bias_offset = threadIdx.x % 32;
  GpuAtomicAdd(s_data + bias_offset, sum);
  __syncthreads();

  // Accumulate the results in the shared memory into the first element.
  // No syncthreads is needed since this is only in the same warp.
  int32 thread_index = threadIdx.x;
#if GOOGLE_CUDA
  if (thread_index < 32) {
    AccT data = s_data[thread_index];
    for (int32 delta = warpSize / 2; delta > 0; delta /= 2) {
      data += GpuShuffleXorSync(kCudaWarpAll, data, delta);
    }
    if (thread_index == 0) {
      GpuAtomicAdd(bias_backprop + bias_index, T(data));
    }
  }
#elif TENSORFLOW_USE_ROCM
  if (thread_index < 16) s_data[thread_index] += s_data[thread_index + 16];
  if (thread_index < 8) s_data[thread_index] += s_data[thread_index + 8];
  if (thread_index < 4) s_data[thread_index] += s_data[thread_index + 4];
  if (thread_index < 2) s_data[thread_index] += s_data[thread_index + 2];
  if (thread_index < 1) s_data[thread_index] += s_data[thread_index + 1];

  // The first thread writes out the accumulated result to the global location.
  if (thread_index == 0) {
    GpuAtomicAdd(bias_backprop + bias_index, T(s_data[0]));
  }
#endif
}

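// Selects between the shared-memory and naive gradient kernels based on the
// data format and the amount of shared memory available per block.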
template <typename T>
void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
                             T* bias_backprop, int32 batch, int32 height,
                             int32 width, int32 depth, int32 channel,
                             TensorFormat data_format) {
  const int32 bias_size = channel;
  const int32 image_size = height * width * depth;
  const int32 total_count = batch * bias_size * image_size;
  if (total_count == 0) {
    return;
  }
  static constexpr int32 kWarpSize = 32;
  GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);

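  // Only plan to use up to half of the available shared memory per block for
  // the per-channel partial sums.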
  const int max_shared_memory_size = d.sharedMemPerBlock() / 2;
  int32 shared_memory_size = 0;
  if (data_format == FORMAT_NHWC) {
    shared_memory_size = bias_size * sizeof(typename AccumulatorType<T>::type);
  }
  // Check if we have enough shared memory.
  if (shared_memory_size <= max_shared_memory_size) {
    if (data_format == FORMAT_NHWC) {
      TF_CHECK_OK(GpuLaunchKernel(BiasGradNHWC_SharedAtomics<T>,
                                  config.block_count, config.thread_per_block,
                                  shared_memory_size, d.stream(), total_count,
                                  output_backprop, bias_backprop, bias_size));
    } else {
      // Round up the block count to multiple of bias_size.
      int group_size = (config.block_count + bias_size - 1) / bias_size;
      config.block_count = group_size * bias_size;
      if (config.thread_per_block < kWarpSize) {
        config.thread_per_block = kWarpSize;
      }
      TF_CHECK_OK(GpuLaunchKernel(BiasGradNCHW_SharedAtomics<T>,
                                  config.block_count, config.thread_per_block,
                                  0, d.stream(), output_backprop, bias_backprop,
                                  batch, bias_size, image_size, group_size));
    }
  } else {
    // Note that even if we don't have enough shared memory to fit the entire
    // output block, it is possible to process one group of elements at a time.
    // But for now, we simply fall back to the naive implementation.
    if (data_format == FORMAT_NHWC) {
      TF_CHECK_OK(GpuLaunchKernel(
          BiasGradNHWC_Naive<T>, config.block_count, config.thread_per_block, 0,
          d.stream(), total_count, output_backprop, bias_backprop, bias_size));
    } else {
      TF_CHECK_OK(GpuLaunchKernel(BiasGradNCHW_Naive<T>, config.block_count,
                                  config.thread_per_block, 0, d.stream(),
                                  total_count, output_backprop, bias_backprop,
                                  bias_size, image_size));
    }
  }
}

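// The reductions below sum a rows x cols matrix using the shared GPU
// reduction kernels: DoRowReduction produces one sum per row (reducing over
// the column axis) and DoColReduction one sum per column (reducing over the
// row axis).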
template <typename T>
void BiasGradGPU<T>::DoRowReduction(OpKernelContext* context, T* output,
                                    const T* input, int rows, int cols) {
  typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
  Constants<GPUDevice> constants;
  gpuprim::Sum op;
  functor::ReduceImpl<T, gpuprim::Sum, T*, const T*, ReductionAxes>(
      context, output, input, 2, rows, cols, 1, 1, constants.kOne, op);
}

template <typename T>
void BiasGradGPU<T>::DoColReduction(OpKernelContext* context, T* output,
                                    const T* input, int rows, int cols) {
  typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
  Constants<GPUDevice> constants;
  gpuprim::Sum op;
  functor::ReduceImpl<T, gpuprim::Sum, T*, const T*, ReductionAxes>(
      context, output, input, 2, rows, cols, 1, 1, constants.kZero, op);
}

#define DEFINE_GPU_SPECS(T)   \
  template struct BiasGPU<T>; \
  template struct BiasGradGPU<T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);

// No BiasGrad kernel for int32.
template struct BiasGPU<int32>;

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM