/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <stdio.h>

#include "tensorflow/core/kernels/split_lib.h"

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/gpu_device_array_gpu.h"
#include "tensorflow/core/kernels/split_lib_gpu.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

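// GPU implementations of the Split and SplitV ops: Eigen-based slicing
// functors plus custom CUDA kernels for equal-sized and variable-sized
// splits along one dimension.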
namespace tensorflow {
namespace functor {

template <typename Device, typename T, int NDims>
void Split<Device, T, NDims>::operator()(
    const Device& d, typename TTypes<T, NDims>::Tensor output,
    typename TTypes<T, NDims>::ConstTensor input,
    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_indices,
    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_sizes) {
  To32Bit(output).device(d) = To32Bit(input).slice(slice_indices, slice_sizes);
}

template <typename Device, typename T>
void SplitCustom<Device, T>::operator()(
    const Device& d, typename TTypes<T, 2>::Tensor output,
    typename TTypes<T, 2>::ConstTensor input,
    const Eigen::DSizes<Eigen::DenseIndex, 2>& slice_indices,
    const Eigen::DSizes<Eigen::DenseIndex, 2>& slice_sizes) {
  To32Bit(output).device(d) = To32Bit(input).slice(slice_indices, slice_sizes);
}

#define DEFINE_GPU_KERNELS(T)                    \
  template struct Split<Eigen::GpuDevice, T, 2>; \
  template struct Split<Eigen::GpuDevice, T, 3>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
TF_CALL_complex64(DEFINE_GPU_KERNELS);
TF_CALL_complex128(DEFINE_GPU_KERNELS);
TF_CALL_int64(DEFINE_GPU_KERNELS);
TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
TF_CALL_uint8(DEFINE_GPU_KERNELS);
TF_CALL_bool(DEFINE_GPU_KERNELS);

#undef DEFINE_GPU_KERNELS
#define DEFINE_GPU_KERNELS(T) template struct SplitCustom<Eigen::GpuDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
TF_CALL_complex64(DEFINE_GPU_KERNELS);
TF_CALL_complex128(DEFINE_GPU_KERNELS);
TF_CALL_bfloat16(DEFINE_GPU_KERNELS);

#undef DEFINE_GPU_KERNELS

}  // namespace functor

namespace {

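// Splits the input into num_split equal-sized pieces along its middle
// dimension.  The input is viewed as a dense
// [prefix_dim_size, split_dim_size, suffix_dim_size] buffer, and
// output_ptr_data holds one pointer per output piece of shape
// [prefix_dim_size, split_dim_size / num_split, suffix_dim_size].  Each
// loop iteration copies one input element into the piece that owns its
// split-dimension index.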
template <typename T>
__global__ void SplitOpKernel(const T* input, int32 prefix_dim_size,
                              int32 split_dim_size, int32 suffix_dim_size,
                              GpuDeviceArrayStruct<T*> output_ptr_data) {
  const int32 num_split = output_ptr_data.size;
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);

  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(split_dim_size % num_split == 0);

  int32 size = prefix_dim_size * split_dim_size * suffix_dim_size;
  int32 piece_size = split_dim_size / num_split;

  CUDA_1D_KERNEL_LOOP(offset, size) {
    // Calculate the index into input from offset.
    int32 i = offset / (split_dim_size * suffix_dim_size);
    int32 j = (offset % (split_dim_size * suffix_dim_size)) / suffix_dim_size;
    int32 k = offset % suffix_dim_size;

    // Find the output buffer that should be written to.
    T* output_ptr = output_ptrs[j / piece_size];
    // output_ptr points to an array of size
    //  [prefix_dim_size][piece_size][suffix_dim_size].
    //
    // output_ptr[i][j % piece_size][k] = input[offset];
    // Linearize (i, j % piece_size, k) into an offset.
    int32 output_offset = i * piece_size * suffix_dim_size +
                          (j % piece_size) * suffix_dim_size + k;
    *(output_ptr + output_offset) = ldg(input + offset);
  }
}

}  // namespace

// Cannot be in the anonymous namespace because it uses extern shared memory.
// Very similar to the concat kernel, except that the input/output logic
// is reversed.
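//
// The input is treated as a [total_rows, total_cols] matrix that is split
// column-wise into variably sized pieces.  output_scan holds the running
// prefix sum of the output column widths (the starting column of each
// output piece), and output_ptr_data holds one pointer per output.  When
// useSmem is true, the prefix-sum table is first staged into shared memory.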
template <typename T, typename IntType, bool useSmem>
__global__ void split_v_kernel(const T* input_ptr,
                               GpuDeviceArrayStruct<IntType> output_scan,
                               IntType total_rows, IntType total_cols,
                               GpuDeviceArrayStruct<T*> output_ptr_data) {
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);
  IntType* col_scan = GetGpuDeviceArrayOnDevice(&output_scan);

  // Do an upper_bound on the column to find which output pointer to use.
  IntType gidx = blockIdx.x * blockDim.x + threadIdx.x;
  int num_outputs = output_ptr_data.size;

  // Verbose declaration needed due to the template.
  extern __shared__ __align__(sizeof(T)) unsigned char smem[];
  IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

  if (useSmem) {
    IntType lidx = threadIdx.y * blockDim.x + threadIdx.x;
    IntType blockSize = blockDim.x * blockDim.y;

    for (IntType i = lidx; i < output_scan.size; i += blockSize) {
      smem_col_scan[i] = col_scan[i];
    }

    __syncthreads();

    col_scan = smem_col_scan;
  }

  // Do an initial binary search and then scan linearly from there; this
  // works well both when there are many small segments and when the
  // segments are much longer.
  IntType segment =
      cuda_helper::upper_bound<IntType>(col_scan, num_outputs, gidx) - 1;

  IntType curr_offset = col_scan[segment];
  IntType curr_segment = segment;
  for (; gidx < total_cols; gidx += blockDim.x * gridDim.x) {
    IntType curr_col_offset;
    while ((curr_col_offset = col_scan[curr_segment + 1]) <= gidx) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    IntType local_col = gidx - curr_offset;
    IntType segment_width = curr_col_offset - curr_offset;
    T* output_ptr = output_ptrs[curr_segment];

    IntType gidy = blockIdx.y * blockDim.y + threadIdx.y;
    for (; gidy < total_rows; gidy += blockDim.y * gridDim.y)
      output_ptr[gidy * segment_width + local_col] =
          input_ptr[gidy * total_cols + gidx];
  }
}

// Different from the original split implementation because it works on 2D
// rather than 3D data.  This version is likely faster because it does less
// integer math.
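// The input is viewed as a [prefix_dim_size, suffix_dim_size] buffer whose
// suffix dimension is split into num_split equal pieces, one per output
// pointer in output_ptr_data.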
template <typename T>
__global__ void SplitVOpKernel_fixed(const T* input, int32 prefix_dim_size,
                                     int32 suffix_dim_size,
                                     GpuDeviceArrayStruct<T*> output_ptr_data) {
  const int32 num_split = output_ptr_data.size;
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);

  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);

  int32 size = prefix_dim_size * suffix_dim_size;
  int32 piece_size = suffix_dim_size / num_split;

  CUDA_1D_KERNEL_LOOP(offset, size) {
    // Calculate the index into input from offset.
    int32 i = offset / suffix_dim_size;
    int32 j = offset % suffix_dim_size;

    // Find the output buffer that should be written to.
    T* output_ptr = output_ptrs[j / piece_size];
    int32 output_offset = i * piece_size + (j % piece_size);
    output_ptr[output_offset] = input[offset];
  }
}

template <typename T>
void SplitOpGPULaunch<T>::Run(const Eigen::GpuDevice& d, const T* input,
                              int32 prefix_dim_size, int32 split_dim_size,
                              int32 suffix_dim_size,
                              const GpuDeviceArrayStruct<T*>& output_ptr_data) {
  CudaLaunchConfig config = GetCudaLaunchConfig(
      prefix_dim_size * split_dim_size * suffix_dim_size, d);

  TF_CHECK_OK(CudaLaunchKernel(SplitOpKernel<T>, config.block_count,
                               config.thread_per_block, 0, d.stream(), input,
                               prefix_dim_size, split_dim_size, suffix_dim_size,
                               output_ptr_data));
}

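// Launches either the fixed-size kernel (when all output pieces have equal
// width) or the general split_v_kernel.  In the general case, the column
// prefix-sum table is staged in shared memory when it fits well below the
// per-block limit, falling back to global memory otherwise.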
template <typename T, typename IntType>
void SplitVOpGPULaunch<T, IntType>::Run(
    const Eigen::GpuDevice& gpu_device, bool fixed_size, const T* input_ptr,
    int total_rows, int total_cols,
    const GpuDeviceArrayStruct<IntType>& output_scan,
    const GpuDeviceArrayStruct<T*>& output_ptr_data) {
  if (fixed_size) {
    CudaLaunchConfig config =
        GetCudaLaunchConfig(total_rows * total_cols, gpu_device);

    SplitVOpKernel_fixed<T><<<config.block_count, config.thread_per_block, 0,
                              gpu_device.stream()>>>(
        input_ptr, total_rows, total_cols, output_ptr_data);
  } else {
    auto config = GetCuda2DLaunchConfig(total_cols, total_rows, gpu_device);
    IntType smem_max = gpu_device.sharedMemPerBlock();
    IntType smem_usage = output_scan.size * sizeof(IntType);
    // The performance crossover happens below the maximum available shared
    // memory on most processors, possibly due to decreasing occupancy.
    // 4096 inputs is a lot; most code will take the smem path.
    const int32 kMaxSmemBytesPerformance = 16384;
    if (smem_usage < smem_max && smem_usage < kMaxSmemBytesPerformance)
      split_v_kernel<T, IntType, true>
          <<<config.block_count, config.thread_per_block, smem_usage,
             gpu_device.stream()>>>(input_ptr, output_scan, total_rows,
                                    total_cols, output_ptr_data);
    else
      split_v_kernel<T, IntType, false>
          <<<config.block_count, config.thread_per_block, 0,
             gpu_device.stream()>>>(input_ptr, output_scan, total_rows,
                                    total_cols, output_ptr_data);
  }
}

#define REGISTER_GPU_KERNEL(T) template struct SplitOpGPULaunch<T>;

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_complex64(REGISTER_GPU_KERNEL);
TF_CALL_complex128(REGISTER_GPU_KERNEL);
TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#define REGISTER_GPU_KERNEL(T)                 \
  template struct SplitVOpGPULaunch<T, int32>; \
  template struct SplitVOpGPULaunch<T, int64>;

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_complex64(REGISTER_GPU_KERNEL);
TF_CALL_complex128(REGISTER_GPU_KERNEL);
TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL

}  // namespace tensorflow

#endif  // GOOGLE_CUDA