/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <stdio.h>

#include "tensorflow/core/kernels/split_lib.h"

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/gpu_device_array_gpu.h"
#include "tensorflow/core/kernels/split_lib_gpu.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
namespace functor {

template <typename Device, typename T, int NDims>
void Split<Device, T, NDims>::operator()(
    const Device& d, typename TTypes<T, NDims>::Tensor output,
    typename TTypes<T, NDims>::ConstTensor input,
    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_indices,
    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_sizes) {
  To32Bit(output).device(d) = To32Bit(input).slice(slice_indices, slice_sizes);
}

template <typename Device, typename T>
void SplitCustom<Device, T>::operator()(
    const Device& d, typename TTypes<T, 2>::Tensor output,
    typename TTypes<T, 2>::ConstTensor input,
    const Eigen::DSizes<Eigen::DenseIndex, 2>& slice_indices,
    const Eigen::DSizes<Eigen::DenseIndex, 2>& slice_sizes) {
  To32Bit(output).device(d) = To32Bit(input).slice(slice_indices, slice_sizes);
}

#define DEFINE_GPU_KERNELS(T)                    \
  template struct Split<Eigen::GpuDevice, T, 2>; \
  template struct Split<Eigen::GpuDevice, T, 3>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
TF_CALL_complex64(DEFINE_GPU_KERNELS);
TF_CALL_complex128(DEFINE_GPU_KERNELS);
TF_CALL_int64(DEFINE_GPU_KERNELS);
TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
TF_CALL_uint8(DEFINE_GPU_KERNELS);
TF_CALL_bool(DEFINE_GPU_KERNELS);

#undef DEFINE_GPU_KERNELS
#define DEFINE_GPU_KERNELS(T) template struct SplitCustom<Eigen::GpuDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
TF_CALL_complex64(DEFINE_GPU_KERNELS);
TF_CALL_complex128(DEFINE_GPU_KERNELS);
TF_CALL_bfloat16(DEFINE_GPU_KERNELS);

#undef DEFINE_GPU_KERNELS

}  // namespace functor

namespace {

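// Splits 'input', viewed as a [prefix_dim_size, split_dim_size,
// suffix_dim_size] tensor, into output_ptr_data.size equal pieces along the
// split dimension; split_dim_size must be evenly divisible by the number of
// outputs (see the eigen_assert below). Each output buffer holds a contiguous
// [prefix_dim_size, piece_size, suffix_dim_size] block, where
// piece_size = split_dim_size / num_split. For example, with
// split_dim_size = 6 and num_split = 3, piece_size is 2 and split-dimension
// index j = 5 lands in output 2 at inner index j % piece_size = 1.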
template <typename T>
__global__ void SplitOpKernel(const T* input, int32 prefix_dim_size,
                              int32 split_dim_size, int32 suffix_dim_size,
                              GpuDeviceArrayStruct<T*> output_ptr_data) {
  const int32 num_split = output_ptr_data.size;
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);

  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(split_dim_size % num_split == 0);

  int32 size = prefix_dim_size * split_dim_size * suffix_dim_size;
  int32 piece_size = split_dim_size / num_split;

  CUDA_1D_KERNEL_LOOP(offset, size) {
    // Calculate the index into input from offset.
    int32 i = offset / (split_dim_size * suffix_dim_size);
    int32 j = (offset % (split_dim_size * suffix_dim_size)) / suffix_dim_size;
    int32 k = offset % suffix_dim_size;

    // Find the output buffer that should be written to.
    T* output_ptr = output_ptrs[j / piece_size];
    // output_ptr is pointing to an array of size
    // [prefix_dim_size][piece_size][suffix_dim_size].
    //
    // output_ptr[i][j % piece_size][k] = input[offset];
    // Linearize (i, j % piece_size, k) into an offset.
    int32 output_offset = i * piece_size * suffix_dim_size +
                          (j % piece_size) * suffix_dim_size + k;
    *(output_ptr + output_offset) = ldg(input + offset);
  }
}

}  // namespace

// Cannot be in an anonymous namespace because of the extern shared memory.
// Very similar to the concat kernel, except that the input/output logic is
// reversed.
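//
// output_scan holds the running column offsets of the outputs: col_scan[i] is
// the first input column owned by output i, so col_scan[i + 1] - col_scan[i]
// is that output's width. When useSmem is true the scan is first staged into
// dynamic shared memory so the per-element searches read on-chip memory
// instead of global memory.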
template <typename T, typename IntType, bool useSmem>
__global__ void split_v_kernel(const T* input_ptr,
                               GpuDeviceArrayStruct<IntType> output_scan,
                               IntType total_rows, IntType total_cols,
                               GpuDeviceArrayStruct<T*> output_ptr_data) {
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);
  IntType* col_scan = GetGpuDeviceArrayOnDevice(&output_scan);

  // do upper_bound on col to find which pointer we should be using
  IntType gidx = blockIdx.x * blockDim.x + threadIdx.x;
  int num_outputs = output_ptr_data.size;

  // The verbose declaration via unsigned char plus a reinterpret_cast is
  // needed because declaring the extern shared array directly as T would
  // conflict across template instantiations.
  extern __shared__ __align__(sizeof(T)) unsigned char smem[];
  IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

  if (useSmem) {
    IntType lidx = threadIdx.y * blockDim.x + threadIdx.x;
    IntType blockSize = blockDim.x * blockDim.y;

    for (IntType i = lidx; i < output_scan.size; i += blockSize) {
      smem_col_scan[i] = col_scan[i];
    }

    __syncthreads();

    col_scan = smem_col_scan;
  }

  // Do an initial binary search, then scan linearly from there; this works
  // well both when there are many small segments and when the segments are
  // much longer.
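  // upper_bound returns the index of the first scan entry strictly greater
  // than gidx, so subtracting one gives the segment whose column range
  // contains gidx.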
  IntType segment =
      cuda_helper::upper_bound<IntType>(col_scan, num_outputs, gidx) - 1;

  IntType curr_offset = col_scan[segment];
  IntType curr_segment = segment;
  for (; gidx < total_cols; gidx += blockDim.x * gridDim.x) {
    IntType curr_col_offset;
    while ((curr_col_offset = col_scan[curr_segment + 1]) <= gidx) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    IntType local_col = gidx - curr_offset;
    IntType segment_width = curr_col_offset - curr_offset;
    T* output_ptr = output_ptrs[curr_segment];

    IntType gidy = blockIdx.y * blockDim.y + threadIdx.y;
    for (; gidy < total_rows; gidy += blockDim.y * gridDim.y)
      output_ptr[gidy * segment_width + local_col] =
          input_ptr[gidy * total_cols + gidx];
  }
}

// Different from the original split implementation in that it works on 2D
// rather than 3D dimensions. This version is likely faster because it does
// less integer math.
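//
// Splits 'input', viewed as a [prefix_dim_size, suffix_dim_size] matrix, into
// output_ptr_data.size equal column blocks of width
// suffix_dim_size / num_split, so it is only valid when every output has the
// same size.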
template <typename T>
__global__ void SplitVOpKernel_fixed(const T* input, int32 prefix_dim_size,
                                     int32 suffix_dim_size,
                                     GpuDeviceArrayStruct<T*> output_ptr_data) {
  const int32 num_split = output_ptr_data.size;
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);

  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);

  int32 size = prefix_dim_size * suffix_dim_size;
  int32 piece_size = suffix_dim_size / num_split;

  CUDA_1D_KERNEL_LOOP(offset, size) {
    // Calculate the index into input from offset.
    int32 i = offset / suffix_dim_size;
    int32 j = offset % suffix_dim_size;

    // Find the output buffer that should be written to.
    T* output_ptr = output_ptrs[j / piece_size];
    int32 output_offset = i * piece_size + (j % piece_size);
    output_ptr[output_offset] = input[offset];
  }
}

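// Launches SplitOpKernel with a 1D configuration sized to the flattened
// prefix_dim_size * split_dim_size * suffix_dim_size element count.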
template <typename T>
void SplitOpGPULaunch<T>::Run(const Eigen::GpuDevice& d, const T* input,
                              int32 prefix_dim_size, int32 split_dim_size,
                              int32 suffix_dim_size,
                              const GpuDeviceArrayStruct<T*>& output_ptr_data) {
  CudaLaunchConfig config = GetCudaLaunchConfig(
      prefix_dim_size * split_dim_size * suffix_dim_size, d);

  TF_CHECK_OK(CudaLaunchKernel(SplitOpKernel<T>, config.block_count,
                               config.thread_per_block, 0, d.stream(), input,
                               prefix_dim_size, split_dim_size, suffix_dim_size,
                               output_ptr_data));
}

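// Dispatches the split_v kernels. When every output has the same size
// (fixed_size), the cheaper SplitVOpKernel_fixed is launched with a 1D
// configuration; otherwise split_v_kernel is launched with a 2D configuration,
// using the shared-memory variant whenever the column-offset scan fits within
// both the device's shared memory and the 16 KB performance threshold below.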
template <typename T, typename IntType>
void SplitVOpGPULaunch<T, IntType>::Run(
    const Eigen::GpuDevice& gpu_device, bool fixed_size, const T* input_ptr,
    int total_rows, int total_cols,
    const GpuDeviceArrayStruct<IntType>& output_scan,
    const GpuDeviceArrayStruct<T*>& output_ptr_data) {
  if (fixed_size) {
    CudaLaunchConfig config =
        GetCudaLaunchConfig(total_rows * total_cols, gpu_device);

    SplitVOpKernel_fixed<T><<<config.block_count, config.thread_per_block, 0,
                              gpu_device.stream()>>>(
        input_ptr, total_rows, total_cols, output_ptr_data);
  } else {
    auto config = GetCuda2DLaunchConfig(total_cols, total_rows, gpu_device);
    IntType smem_max = gpu_device.sharedMemPerBlock();
    IntType smem_usage = output_scan.size * sizeof(IntType);
    // The performance crossover occurs below the maximum available shared
    // memory on most processors, possibly due to decreasing occupancy.
    // 16384 bytes holds 4096 int32 (or 2048 int64) offsets; that many outputs
    // is a lot, so most cases will take the shared-memory path.
    const int32 kMaxSmemBytesPerformance = 16384;
    if (smem_usage < smem_max && smem_usage < kMaxSmemBytesPerformance)
      split_v_kernel<T, IntType, true>
          <<<config.block_count, config.thread_per_block, smem_usage,
             gpu_device.stream()>>>(input_ptr, output_scan, total_rows,
                                    total_cols, output_ptr_data);
    else
      split_v_kernel<T, IntType, false>
          <<<config.block_count, config.thread_per_block, 0,
             gpu_device.stream()>>>(input_ptr, output_scan, total_rows,
                                    total_cols, output_ptr_data);
  }
}

#define REGISTER_GPU_KERNEL(T) template struct SplitOpGPULaunch<T>;

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_complex64(REGISTER_GPU_KERNEL);
TF_CALL_complex128(REGISTER_GPU_KERNEL);
TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#define REGISTER_GPU_KERNEL(T)                 \
  template struct SplitVOpGPULaunch<T, int32>; \
  template struct SplitVOpGPULaunch<T, int64>;

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_complex64(REGISTER_GPU_KERNEL);
TF_CALL_complex128(REGISTER_GPU_KERNEL);
TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL

}  // namespace tensorflow

#endif  // GOOGLE_CUDA