/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

#include <stdio.h>

#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/relu_op_functor.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"
#include "tensorflow/core/util/gpu_launch_config.h"

#if TENSORFLOW_USE_ROCM
#include "rocm/include/hip/hip_fp16.h"
typedef __half2 half2;
#endif

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

static constexpr int VectorSizeElements = 8;
namespace functor {
// This kernel computes ReluGrad by processing one half2 (two fp16 values) at
// a time. It effectively does: backprop = (feature > 0) ? gradient : 0.
// It also tries to use native half2 primitives as much as possible.
__global__ void ReluGradHalfKernel(const Eigen::half* __restrict__ gradient,
                                   const Eigen::half* __restrict__ feature,
                                   Eigen::half* __restrict__ backprop,
                                   int32 count) {
  int32 half2_count = count >> 1;
  int32 index = blockIdx.x * blockDim.x + threadIdx.x;
  const int32 total_device_threads = gridDim.x * blockDim.x;

  while (index < half2_count) {
    // The fast branch.
    // One half2 (two fp16 values) is fetched and processed at a time.
    half2 gradient_h2 = reinterpret_cast<const half2*>(gradient)[index];
    half2 feature_h2 = reinterpret_cast<const half2*>(feature)[index];
    half2* p_backprop_h2 = reinterpret_cast<half2*>(backprop) + index;

#if __CUDA_ARCH__ >= 530
    // Fast path, when half2 primitives are available.
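    // __hgt2 compares the two fp16 lanes and returns 1.0 for a lane where the
    // comparison holds and 0.0 otherwise, so multiplying the mask by the
    // gradient implements (feature > 0) ? gradient : 0 without branching.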
    const half2 kZeroH2 = __float2half2_rn(0.f);
    // mask = (feature > 0)
    half2 mask_h2 = __hgt2(feature_h2, kZeroH2);
    // backprop = mask * gradient
    half2 backprop_h2 = __hmul2(mask_h2, gradient_h2);
#else
    // Fall back: convert half2 to float2 for processing.
    float2 feature_f2 = __half22float2(feature_h2);
    float2 gradient_f2 = __half22float2(gradient_h2);
    float2 backprop_f2 =
        make_float2((feature_f2.x > 0.0f) ? float(gradient_f2.x) : 0.0f,
                    (feature_f2.y > 0.0f) ? float(gradient_f2.y) : 0.0f);
    // Convert back to half2.
    half2 backprop_h2 = __float22half2_rn(backprop_f2);
#endif

    // Write back the result.
    *p_backprop_h2 = backprop_h2;

    index += total_device_threads;
  }

  if ((count & 0x1) == 1 && index == half2_count) {
    // If the total number of elements is odd, process the last element.
    Eigen::half grad_h = gradient[count - 1];
    Eigen::half feature_h = feature[count - 1];

    float grad_f = static_cast<float>(grad_h);
    float feature_f = static_cast<float>(feature_h);
    float backprop_f = (feature_f > 0) ? grad_f : 0;

    Eigen::half backprop_h(backprop_f);
    backprop[count - 1] = backprop_h;
  }
}

__global__ void ReluGradHalfKernelVector(
    const Eigen::half* __restrict__ gradient,
    const Eigen::half* __restrict__ feature, Eigen::half* __restrict__ backprop,
    int32 count) {
  int32 half8_count = count / VectorSizeElements;
  int32 index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index < half8_count) {
    // Cast to xx_h8 for vector load and store.
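    // A float4 load/store moves 16 bytes (eight fp16 values) per instruction.
    // The launcher below only dispatches to this kernel when gradient, feature
    // and backprop are all 16-byte aligned, so these casts are safe.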
    float4 gradient_h8 = reinterpret_cast<const float4*>(gradient)[index];
    float4 feature_h8 = reinterpret_cast<const float4*>(feature)[index];
    float4* p_backprop_h8 = reinterpret_cast<float4*>(backprop) + index;

    half2* gradient_h2 = reinterpret_cast<half2*>(&gradient_h8);
    half2* feature_h2 = reinterpret_cast<half2*>(&feature_h8);
    float4 backprop_h8;
    half2* p_backprop_h2 = reinterpret_cast<half2*>(&backprop_h8);

    // Fast path, when half2 primitives are available.
#if __CUDA_ARCH__ >= 530
    const half2 kZeroH2 = __float2half2_rn(0.f);
#endif
    for (int i = 0; i < VectorSizeElements / 2; i++) {
#if __CUDA_ARCH__ >= 530
      // mask = (feature > 0)
      half2 mask_h2 = __hgt2(feature_h2[i], kZeroH2);
      // backprop = mask * gradient
      half2 backprop_h2 = __hmul2(mask_h2, gradient_h2[i]);
#else
      // Fall back: convert half2 to float2 for processing.
      float2 feature_f2 = __half22float2(feature_h2[i]);
      float2 gradient_f2 = __half22float2(gradient_h2[i]);
      float2 backprop_f2 =
          make_float2((feature_f2.x > 0.0f) ? float(gradient_f2.x) : 0.0f,
                      (feature_f2.y > 0.0f) ? float(gradient_f2.y) : 0.0f);
      // Convert back to half2.
      half2 backprop_h2 = __float22half2_rn(backprop_f2);
#endif
      p_backprop_h2[i] = backprop_h2;
    }
    // Write back the result.
    *p_backprop_h8 = backprop_h8;
  }

  int remaining_count = (count % VectorSizeElements);

  if (index < remaining_count) {
    // Use the first few threads of the grid to process the remaining elements.
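    // For example, with count = 21: half8_count = 2, remaining_count = 5, and
    // threads 0..4 each handle one of the tail elements 16..20.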
    Eigen::half grad_h = gradient[half8_count * VectorSizeElements + index];
    Eigen::half feature_h = feature[half8_count * VectorSizeElements + index];

    float grad_f = static_cast<float>(grad_h);
    float feature_f = static_cast<float>(feature_h);
    float backprop_f = (feature_f > 0) ? grad_f : 0;

    Eigen::half backprop_h(backprop_f);
    backprop[half8_count * VectorSizeElements + index] = backprop_h;
  }
}

template <typename Device>
struct ReluGrad<Device, Eigen::half> {
  // Computes ReluGrad backprop.
  //
  // gradient: gradient backpropagated to the Relu op.
  // feature: either the inputs that were passed to the Relu, or its outputs
  //   (using either one yields the same result here).
  // backprop: gradient to backpropagate to the Relu inputs.
  void operator()(const Device& d,
                  typename TTypes<Eigen::half>::ConstTensor gradient,
                  typename TTypes<Eigen::half>::ConstTensor feature,
                  typename TTypes<Eigen::half>::Tensor backprop) {
    // NOTE: When the activation is exactly zero, we do not propagate the
    // associated gradient value. This allows either the input or the output
    // of the Relu to be passed as 'feature'.
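    // The vectorized kernel below loads and stores eight fp16 values at a
    // time through float4, so it can only be used when all three buffers are
    // 16-byte aligned.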
    auto gradient_ptr = reinterpret_cast<uintptr_t>(gradient.data());
    auto feature_ptr = reinterpret_cast<uintptr_t>(feature.data());
    auto backprop_ptr = reinterpret_cast<uintptr_t>(backprop.data());
    bool aligned = gradient_ptr % 16 == 0 && feature_ptr % 16 == 0 &&
                   backprop_ptr % 16 == 0;
    int32 count = gradient.size();
    constexpr int32 kThreadInBlock = 512;
    if (count == 0) return;
    if (aligned) {
      int32 half8_count = Eigen::divup(count, VectorSizeElements);
      int32 kBlock = Eigen::divup(half8_count, kThreadInBlock);
      TF_CHECK_OK(GpuLaunchKernel(
          ReluGradHalfKernelVector, kBlock, kThreadInBlock, 0, d.stream(),
          gradient.data(), feature.data(), backprop.data(), count));
    } else {
      int32 half2_count = Eigen::divup(count, 2);
      GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize(
          half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock);
      TF_CHECK_OK(GpuLaunchKernel(
          ReluGradHalfKernel, config.block_count, config.thread_per_block, 0,
          d.stream(), gradient.data(), feature.data(), backprop.data(), count));
    }
  }
};

__global__ void Relu_int8x4_kernel(int vect_count,
                                   const int32* __restrict__ input,
                                   int32* __restrict__ output) {
  CUDA_1D_KERNEL_LOOP(index, vect_count) {
#if GOOGLE_CUDA
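    // __vmaxs4 computes a per-byte signed maximum, i.e. max(x, 0) for each of
    // the four packed int8 lanes.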
    output[index] = __vmaxs4(input[index], 0);
#else
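    // ROCm fallback: emulate a per-byte max(x, 0) with bit tricks.
    // (~input) & 0x80808080 sets bit 7 of every byte whose value is
    // non-negative (sign bit clear). The shift/or sequence spreads that bit
    // across the byte, and the final & 0x7f7f7f7f leaves a 0x7f mask per
    // non-negative byte (bit 7 of such a byte is already 0). ANDing the input
    // with the mask zeroes the negative bytes and keeps the rest unchanged.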
    uint32 signs = (~input[index]) & 0x80808080;
    signs = signs >> 7;
    signs |= signs << 1;
    signs |= signs << 2;
    signs |= signs << 4;
    signs &= 0x7f7f7f7f;
    output[index] = input[index] & signs;
#endif
  }
}

// Functor used by ReluOp to do the computations.
template <typename Device>
struct Relu<Device, qint8> {
  // Computes Relu activation of 'input' containing int8 elements, whose buffer
  // size should be a multiple of 4, and aligned to an int32* boundary.
  // (Alignment should be guaranteed by the GPU tensor allocator).
  // 'output' should have the same size as 'input'.
  void operator()(const Device& d, typename TTypes<qint8>::ConstTensor input,
                  typename TTypes<qint8>::Tensor output) {
    int32 count = input.size();
    if (count == 0) return;

    int32 vect_count = Eigen::divup(count, 4);
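    // Each int32 element packs four qint8 values, so the kernel is launched
    // over count / 4 vectorized elements.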
    constexpr int32 kThreadInBlock = 512;
    GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize(
        vect_count, d, Relu_int8x4_kernel, 0, kThreadInBlock);
    TF_CHECK_OK(GpuLaunchKernel(
        Relu_int8x4_kernel, config.block_count, config.thread_per_block, 0,
        d.stream(), vect_count, reinterpret_cast<const int32*>(input.data()),
        reinterpret_cast<int32*>(output.data())));
  }
};

}  // namespace functor

#if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
#define DEFINE_GPU_NO_MLIR_KERNELS(T)          \
  template struct functor::Relu<GPUDevice, T>; \
  template struct functor::Elu<GPUDevice, T>;  \
  template struct functor::Selu<GPUDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_NO_MLIR_KERNELS);

#undef DEFINE_GPU_NO_MLIR_KERNELS
#endif

// Definition of the GPU implementations declared in relu_op.cc.
#define DEFINE_GPU_KERNELS(T)                           \
  template struct functor::ReluGrad<GPUDevice, T>;      \
  template struct functor::Relu6<GPUDevice, T>;         \
  template struct functor::Relu6Grad<GPUDevice, T>;     \
  template struct functor::LeakyRelu<GPUDevice, T>;     \
  template struct functor::LeakyReluGrad<GPUDevice, T>; \
  template struct functor::EluGrad<GPUDevice, T>;       \
  template struct functor::SeluGrad<GPUDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
template struct functor::Relu<GPUDevice, qint8>;

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM