/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

#include <stdio.h>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/gpu_device_array_gpu.h"
#include "tensorflow/core/kernels/split_lib.h"
#include "tensorflow/core/kernels/split_lib_gpu.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {
namespace functor {

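// Copies the slice of `input` described by (slice_indices, slice_sizes) into
// `output` on the given device, using 32-bit indexing for speed.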
template <typename Device, typename T, int NDims>
void Split<Device, T, NDims>::operator()(
    const Device& d, typename TTypes<T, NDims>::Tensor output,
    typename TTypes<T, NDims>::ConstTensor input,
    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_indices,
    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_sizes) {
  To32Bit(output).device(d) = To32Bit(input).slice(slice_indices, slice_sizes);
}

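// Same slice copy as above, specialized to rank-2 (flattened) tensors.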
template <typename Device, typename T>
void SplitCustom<Device, T>::operator()(
    const Device& d, typename TTypes<T, 2>::Tensor output,
    typename TTypes<T, 2>::ConstTensor input,
    const Eigen::DSizes<Eigen::DenseIndex, 2>& slice_indices,
    const Eigen::DSizes<Eigen::DenseIndex, 2>& slice_sizes) {
  To32Bit(output).device(d) = To32Bit(input).slice(slice_indices, slice_sizes);
}

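// Explicit instantiations of the Eigen-based split functors for the GPU
// device, so their definitions are compiled into this translation unit.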
#define DEFINE_GPU_KERNELS(T)                    \
  template struct Split<Eigen::GpuDevice, T, 2>; \
  template struct Split<Eigen::GpuDevice, T, 3>;

TF_CALL_int64(DEFINE_GPU_KERNELS);
TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
TF_CALL_uint8(DEFINE_GPU_KERNELS);
TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_KERNELS);

#undef DEFINE_GPU_KERNELS
#define DEFINE_GPU_KERNELS(T) template struct SplitCustom<Eigen::GpuDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
TF_CALL_COMPLEX_TYPES(DEFINE_GPU_KERNELS);
TF_CALL_bfloat16(DEFINE_GPU_KERNELS);

#undef DEFINE_GPU_KERNELS

}  // namespace functor

namespace {

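// Evenly splits `input`, viewed as
// [prefix_dim_size, split_dim_size, suffix_dim_size], along its middle
// dimension into `num_split` equal pieces, one per output pointer.
//
// Illustrative example (values chosen here, not from the original source):
// with prefix_dim_size = 2, split_dim_size = 4, suffix_dim_size = 3 and
// num_split = 2, piece_size = 2; the element (i = 1, j = 3, k = 2) at flat
// offset 1 * 12 + 3 * 3 + 2 = 23 is written to output j / piece_size = 1 at
// offset 1 * 2 * 3 + (3 % 2) * 3 + 2 = 11.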
template <typename T>
__global__ void SplitOpKernel(const T* __restrict__ input,
                              int32 prefix_dim_size, int32 split_dim_size,
                              int32 suffix_dim_size,
                              GpuDeviceArrayStruct<T*> output_ptr_data) {
  const int32 num_split = output_ptr_data.size;
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);

  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(split_dim_size % num_split == 0);

  int32 size = prefix_dim_size * split_dim_size * suffix_dim_size;
  int32 piece_size = split_dim_size / num_split;

  GPU_1D_KERNEL_LOOP(offset, size) {
    // Calculate the index into input from offset.
    int32 i = offset / (split_dim_size * suffix_dim_size);
    int32 j = (offset % (split_dim_size * suffix_dim_size)) / suffix_dim_size;
    int32 k = offset % suffix_dim_size;

    // Find the output buffer that should be written to.
    T* output_ptr = output_ptrs[j / piece_size];
    // output_ptr is pointing to an array of size
    //  [prefix_dim_size][piece_size][suffix_dim_size].
    //
    // output_ptr[i][j % piece_size][k] = input[offset];
    // Linearize (i, j % piece_size, k) into an offset.
    int32 output_offset = i * piece_size * suffix_dim_size +
                          (j % piece_size) * suffix_dim_size + k;
    *(output_ptr + output_offset) = ldg(input + offset);
  }
}

}  // namespace

// Cannot be in an anonymous namespace because it uses extern shared memory.
// Very similar to the concat kernel, except that the input/output logic is
// reversed.
template <typename T, typename IntType, bool useSmem>
__global__ void split_v_kernel(const T* __restrict__ input_ptr,
                               GpuDeviceArrayStruct<IntType> output_scan,
                               IntType total_rows, IntType total_cols,
                               GpuDeviceArrayStruct<T*> output_ptr_data) {
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);
  IntType* col_scan = GetGpuDeviceArrayOnDevice(&output_scan);

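  // `col_scan` is expected to hold the running prefix sum of the output
  // column widths (one entry per output plus a trailing total), e.g. widths
  // {3, 5, 2} -> col_scan = {0, 3, 8, 10}; a column index gidx = 6 then falls
  // in segment 1 because col_scan[1] <= 6 < col_scan[2].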
  // do upper_bound on col to find which pointer we should be using
  IntType gidx = blockIdx.x * blockDim.x + threadIdx.x;
  int num_outputs = output_ptr_data.size;

  // verbose declaration needed due to template
  GPU_DYNAMIC_SHARED_MEM_DECL(sizeof(T), unsigned char, smem);
  IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

  if (useSmem) {
    IntType lidx = threadIdx.y * blockDim.x + threadIdx.x;
    IntType blockSize = blockDim.x * blockDim.y;

    for (IntType i = lidx; i < output_scan.size; i += blockSize) {
      smem_col_scan[i] = col_scan[i];
    }

    __syncthreads();

    col_scan = smem_col_scan;
  }

  // do an initial binary search and then scan linearly from there
  // works well when there are many small segments and when the
  // segments are much longer
  IntType segment =
      gpu_helper::upper_bound<IntType>(col_scan, num_outputs, gidx) - 1;

  IntType curr_offset = col_scan[segment];
  IntType curr_segment = segment;
  for (; gidx < total_cols; gidx += blockDim.x * gridDim.x) {
    IntType curr_col_offset;
    while ((curr_col_offset = col_scan[curr_segment + 1]) <= gidx) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    IntType local_col = gidx - curr_offset;
    IntType segment_width = curr_col_offset - curr_offset;
    T* output_ptr = output_ptrs[curr_segment];

    IntType gidy = blockIdx.y * blockDim.y + threadIdx.y;
    for (; gidy < total_rows; gidy += blockDim.y * gridDim.y)
      output_ptr[gidy * segment_width + local_col] =
          input_ptr[gidy * total_cols + gidx];
  }
}

// Different from the original split implementation because it operates on 2-D
// rather than 3-D dimensions. This version is likely faster due to less
// integer math.
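//
// Illustrative example (values chosen here, not from the original source):
// with prefix_dim_size = 3, suffix_dim_size = 4 and num_split = 2,
// piece_size = 2; the element at flat offset 7 (i = 1, j = 3) is written to
// output j / piece_size = 1 at offset 1 * 2 + 3 % 2 = 3.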
template <typename T>
__global__ void SplitVOpKernel_fixed(const T* __restrict__ input,
                                     int32 prefix_dim_size,
                                     int32 suffix_dim_size,
                                     GpuDeviceArrayStruct<T*> output_ptr_data) {
  const int32 num_split = output_ptr_data.size;
  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);

  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);

  int32 size = prefix_dim_size * suffix_dim_size;
  int32 piece_size = suffix_dim_size / num_split;

  GPU_1D_KERNEL_LOOP(offset, size) {
    // Calculate the index into input from offset.
    int32 i = offset / suffix_dim_size;
    int32 j = offset % suffix_dim_size;

    // Find the output buffer that should be written to.
    T* output_ptr = output_ptrs[j / piece_size];
    int32 output_offset = i * piece_size + (j % piece_size);
    output_ptr[output_offset] = input[offset];
  }
}

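// Host-side launcher for the even-split kernel above.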
template <typename T>
void SplitOpGPULaunch<T>::Run(const Eigen::GpuDevice& d, const T* input,
                              int32 prefix_dim_size, int32 split_dim_size,
                              int32 suffix_dim_size,
                              const GpuDeviceArrayStruct<T*>& output_ptr_data) {
  GpuLaunchConfig config =
      GetGpuLaunchConfig(prefix_dim_size * split_dim_size * suffix_dim_size, d);

  TF_CHECK_OK(GpuLaunchKernel(SplitOpKernel<T>, config.block_count,
                              config.thread_per_block, 0, d.stream(), input,
                              prefix_dim_size, split_dim_size, suffix_dim_size,
                              output_ptr_data));
}

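// Host-side launcher for SplitV. Uses the fixed-size kernel when `fixed_size`
// is true (equal-sized outputs); otherwise dispatches split_v_kernel, caching
// the column scan in shared memory when it fits under the crossover threshold
// below.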
template <typename T, typename IntType>
void SplitVOpGPULaunch<T, IntType>::Run(
    const Eigen::GpuDevice& gpu_device, bool fixed_size, const T* input_ptr,
    int total_rows, int total_cols,
    const GpuDeviceArrayStruct<IntType>& output_scan,
    const GpuDeviceArrayStruct<T*>& output_ptr_data) {
  if (fixed_size) {
    GpuLaunchConfig config =
        GetGpuLaunchConfig(total_rows * total_cols, gpu_device);

    TF_CHECK_OK(GpuLaunchKernel(SplitVOpKernel_fixed<T>, config.block_count,
                                config.thread_per_block, 0, gpu_device.stream(),
                                input_ptr, total_rows, total_cols,
                                output_ptr_data));
  } else {
    auto config = GetGpu2DLaunchConfig(total_cols, total_rows, gpu_device);
    IntType smem_max = gpu_device.sharedMemPerBlock();
    IntType smem_usage = output_scan.size * sizeof(IntType);
    // The performance crossover point is below the maximum available shared
    // memory on most processors, possibly due to decreasing occupancy.
    // 4096 inputs is a lot; most calls will take the smem path.
    const int32 kMaxSmemBytesPerformance = 16384;
    if (smem_usage < smem_max && smem_usage < kMaxSmemBytesPerformance) {
      TF_CHECK_OK(GpuLaunchKernel(
          split_v_kernel<T, IntType, true>, config.block_count,
          config.thread_per_block, smem_usage, gpu_device.stream(), input_ptr,
          output_scan, total_rows, total_cols, output_ptr_data));
    } else {
      TF_CHECK_OK(GpuLaunchKernel(
          split_v_kernel<T, IntType, false>, config.block_count,
          config.thread_per_block, 0, gpu_device.stream(), input_ptr,
          output_scan, total_rows, total_cols, output_ptr_data));
    }
  }
}

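// Explicit instantiations of the launcher templates for the supported types.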
#define REGISTER_GPU_KERNEL(T) template struct SplitOpGPULaunch<T>;

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#define REGISTER_GPU_KERNEL(T)                 \
  template struct SplitVOpGPULaunch<T, int32>; \
  template struct SplitVOpGPULaunch<T, int64>;

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNEL);
TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM