/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

#include <stdio.h>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/gpu_device_array_gpu.h"
#include "tensorflow/core/kernels/split_lib.h"
#include "tensorflow/core/kernels/split_lib_gpu.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {
namespace functor {
template <typename Device, typename T, int NDims>
void Split<Device, T, NDims>::operator()(
    const Device& d, typename TTypes<T, NDims>::Tensor output,
    typename TTypes<T, NDims>::ConstTensor input,
    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_indices,
    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_sizes) {
  To32Bit(output).device(d) = To32Bit(input).slice(slice_indices, slice_sizes);
}

template <typename Device, typename T>
void SplitCustom<Device, T>::operator()(
    const Device& d, typename TTypes<T, 2>::Tensor output,
    typename TTypes<T, 2>::ConstTensor input,
    const Eigen::DSizes<Eigen::DenseIndex, 2>& slice_indices,
    const Eigen::DSizes<Eigen::DenseIndex, 2>& slice_sizes) {
  To32Bit(output).device(d) = To32Bit(input).slice(slice_indices, slice_sizes);
}

#define DEFINE_GPU_KERNELS(T)                    \
  template struct Split<Eigen::GpuDevice, T, 2>; \
  template struct Split<Eigen::GpuDevice, T, 3>;

TF_CALL_int64(DEFINE_GPU_KERNELS);
TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
TF_CALL_uint8(DEFINE_GPU_KERNELS);
TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_KERNELS);

#undef DEFINE_GPU_KERNELS
#define DEFINE_GPU_KERNELS(T) template struct SplitCustom<Eigen::GpuDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
TF_CALL_COMPLEX_TYPES(DEFINE_GPU_KERNELS);
TF_CALL_bfloat16(DEFINE_GPU_KERNELS);

#undef DEFINE_GPU_KERNELS

}  // namespace functor

namespace {

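// Copies each element of a [prefix_dim_size, split_dim_size, suffix_dim_size]
// input into one of num_split outputs of shape
// [prefix_dim_size, split_dim_size / num_split, suffix_dim_size]; the split
// dimension is assumed to divide evenly across the outputs.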
template <typename T>
__global__ void SplitOpKernel(const T* __restrict__ input,
                              int32 prefix_dim_size, int32 split_dim_size,
                              int32 suffix_dim_size,
                              GpuDeviceArrayStruct<T*> output_ptr_data) {
  const int32 num_split = output_ptr_data.size;
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);

  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(split_dim_size % num_split == 0);

  int32 size = prefix_dim_size * split_dim_size * suffix_dim_size;
  int32 piece_size = split_dim_size / num_split;

  GPU_1D_KERNEL_LOOP(offset, size) {
    // Calculate the index into input from offset.
    int32 i = offset / (split_dim_size * suffix_dim_size);
    int32 j = (offset % (split_dim_size * suffix_dim_size)) / suffix_dim_size;
    int32 k = offset % suffix_dim_size;

    // Find the output buffer that should be written to.
    T* output_ptr = output_ptrs[j / piece_size];
    // output_ptr is pointing to an array of size
    // [prefix_dim_size][piece_size][suffix_dim_size].
    //
    // output_ptr[i][j % piece_size][k] = input[offset];
    // Linearize (i, j % piece_size, k) into an offset.
    int32 output_offset = i * piece_size * suffix_dim_size +
                          (j % piece_size) * suffix_dim_size + k;
    *(output_ptr + output_offset) = ldg(input + offset);
  }
}

}  // namespace

// This kernel cannot be in the anonymous namespace because it uses extern
// shared memory. It is very similar to the concat kernel, except that the
// input/output logic is reversed.
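// output_scan holds the prefix-summed column offsets of the outputs, so an
// upper_bound search on it maps a global column index to the output it
// belongs to.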
template <typename T, typename IntType, bool useSmem>
__global__ void split_v_kernel(const T* __restrict__ input_ptr,
                               GpuDeviceArrayStruct<IntType> output_scan,
                               IntType total_rows, IntType total_cols,
                               GpuDeviceArrayStruct<T*> output_ptr_data) {
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);
  IntType* col_scan = GetGpuDeviceArrayOnDevice(&output_scan);

  // do upper_bound on col to find which pointer we should be using
  IntType gidx = blockIdx.x * blockDim.x + threadIdx.x;
  int num_outputs = output_ptr_data.size;

  // verbose declaration needed due to template
  GPU_DYNAMIC_SHARED_MEM_DECL(sizeof(T), unsigned char, smem);
  IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

  if (useSmem) {
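    // Cooperatively stage the column scan into shared memory so the searches
    // below read it from on-chip storage instead of global memory.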
    IntType lidx = threadIdx.y * blockDim.x + threadIdx.x;
    IntType blockSize = blockDim.x * blockDim.y;

    for (IntType i = lidx; i < output_scan.size; i += blockSize) {
      smem_col_scan[i] = col_scan[i];
    }

    __syncthreads();

    col_scan = smem_col_scan;
  }

  // Do an initial binary search and then scan linearly from there; this works
  // well both when there are many small segments and when the segments are
  // much longer.
  IntType segment =
      gpu_helper::upper_bound<IntType>(col_scan, num_outputs, gidx) - 1;

  IntType curr_offset = col_scan[segment];
  IntType curr_segment = segment;
  for (; gidx < total_cols; gidx += blockDim.x * gridDim.x) {
    IntType curr_col_offset;
    while ((curr_col_offset = col_scan[curr_segment + 1]) <= gidx) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    IntType local_col = gidx - curr_offset;
    IntType segment_width = curr_col_offset - curr_offset;
    T* output_ptr = output_ptrs[curr_segment];

    IntType gidy = blockIdx.y * blockDim.y + threadIdx.y;
    for (; gidy < total_rows; gidy += blockDim.y * gridDim.y)
      output_ptr[gidy * segment_width + local_col] =
          input_ptr[gidy * total_cols + gidx];
  }
}

// Different from the original split implementation due to 2D vs. 3D
// dimensions. This version is likely faster due to less integer math.
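// Used on the fixed-size path, where every output has the same width
// (suffix_dim_size / num_split columns).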
template <typename T>
__global__ void SplitVOpKernel_fixed(const T* __restrict__ input,
                                     int32 prefix_dim_size,
                                     int32 suffix_dim_size,
                                     GpuDeviceArrayStruct<T*> output_ptr_data) {
  const int32 num_split = output_ptr_data.size;
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);

  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);

  int32 size = prefix_dim_size * suffix_dim_size;
  int32 piece_size = suffix_dim_size / num_split;

  GPU_1D_KERNEL_LOOP(offset, size) {
    // Calculate the index into input from offset.
    int32 i = offset / suffix_dim_size;
    int32 j = offset % suffix_dim_size;

    // Find the output buffer that should be written to.
    T* output_ptr = output_ptrs[j / piece_size];
    int32 output_offset = i * piece_size + (j % piece_size);
    output_ptr[output_offset] = input[offset];
  }
}

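// Sizes a 1-D launch over all input elements and dispatches SplitOpKernel.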
template <typename T>
void SplitOpGPULaunch<T>::Run(const Eigen::GpuDevice& d, const T* input,
                              int32 prefix_dim_size, int32 split_dim_size,
                              int32 suffix_dim_size,
                              const GpuDeviceArrayStruct<T*>& output_ptr_data) {
  GpuLaunchConfig config =
      GetGpuLaunchConfig(prefix_dim_size * split_dim_size * suffix_dim_size, d);

  TF_CHECK_OK(GpuLaunchKernel(SplitOpKernel<T>, config.block_count,
                              config.thread_per_block, 0, d.stream(), input,
                              prefix_dim_size, split_dim_size, suffix_dim_size,
                              output_ptr_data));
}

template <typename T, typename IntType>
void SplitVOpGPULaunch<T, IntType>::Run(
    const Eigen::GpuDevice& gpu_device, bool fixed_size, const T* input_ptr,
    int total_rows, int total_cols,
    const GpuDeviceArrayStruct<IntType>& output_scan,
    const GpuDeviceArrayStruct<T*>& output_ptr_data) {
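  // When all outputs have the same width, use the cheaper fixed-size kernel;
  // otherwise use the scan-based split_v_kernel, caching the column scan in
  // shared memory when it is small enough.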
  if (fixed_size) {
    GpuLaunchConfig config =
        GetGpuLaunchConfig(total_rows * total_cols, gpu_device);

    TF_CHECK_OK(GpuLaunchKernel(SplitVOpKernel_fixed<T>, config.block_count,
                                config.thread_per_block, 0, gpu_device.stream(),
                                input_ptr, total_rows, total_cols,
                                output_ptr_data));
  } else {
    auto config = GetGpu2DLaunchConfig(total_cols, total_rows, gpu_device);
    IntType smem_max = gpu_device.sharedMemPerBlock();
    IntType smem_usage = output_scan.size * sizeof(IntType);
    // The performance crossover point is below the maximum available shared
    // memory on most processors, possibly due to decreasing occupancy.
    // 4096 inputs is a lot; most code will take the smem path.
    const int32 kMaxSmemBytesPerformance = 16384;
    if (smem_usage < smem_max && smem_usage < kMaxSmemBytesPerformance) {
      TF_CHECK_OK(GpuLaunchKernel(
          split_v_kernel<T, IntType, true>, config.block_count,
          config.thread_per_block, smem_usage, gpu_device.stream(), input_ptr,
          output_scan, total_rows, total_cols, output_ptr_data));
    } else {
      TF_CHECK_OK(GpuLaunchKernel(
          split_v_kernel<T, IntType, false>, config.block_count,
          config.thread_per_block, 0, gpu_device.stream(), input_ptr,
          output_scan, total_rows, total_cols, output_ptr_data));
    }
  }
}

#define REGISTER_GPU_KERNEL(T) template struct SplitOpGPULaunch<T>;

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#define REGISTER_GPU_KERNEL(T)                 \
  template struct SplitVOpGPULaunch<T, int32>; \
  template struct SplitVOpGPULaunch<T, int64>;

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM