/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.
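//
// Example semantics (illustrative): splitting a [4, 6] tensor along axis 1
// into num_split = 3 yields three [4, 2] tensors. This kernel handles the
// case where num_split evenly divides the size of the split dimension.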

#define EIGEN_USE_THREADS

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/split_lib.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/util/work_sharder.h"
#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/kernels/gpu_device_array.h"
#include "tensorflow/core/kernels/split_lib_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL

template <typename Device, typename T>
class SplitOpBase : public OpKernel {
 public:
  explicit SplitOpBase(OpKernelConstruction* c) : OpKernel(c) {}

  void ComputeEasyCases(OpKernelContext* context, bool* done) {
    const Tensor& input = context->input(1);
    const TensorShape& input_shape = input.shape();
    const Tensor& split_dim_tensor = context->input(0);
    OP_REQUIRES(
        context, split_dim_tensor.shape().dims() == 0,
        errors::InvalidArgument("split_dim must be a scalar but has rank ",
                                split_dim_tensor.shape().dims()));
    const int32 split_dim_orig = split_dim_tensor.flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
    const int32 num_split = num_outputs();

    OP_REQUIRES(
        context, 0 <= split_dim && split_dim < input_shape.dims(),
        errors::InvalidArgument("-input rank(-", input.dims(),
                                ") <= split_dim < input rank (", input.dims(),
                                "), but got ", split_dim_orig));

    OP_REQUIRES(
        context, num_split > 0,
        errors::InvalidArgument(
            "Number of ways to split should be > 0, but got ", num_split));

    OP_REQUIRES(context, input_shape.dim_size(split_dim) % num_split == 0,
                errors::InvalidArgument(
                    "Number of ways to split should evenly divide the split "
                    "dimension, but got split_dim ",
                    split_dim, " (size = ", input_shape.dim_size(split_dim),
                    ") ", "and num_split ", num_split));
    // Special case 1: num_split == 1. Nothing to do.
    if (num_split == 1) {
      VLOG(1) << "Split identity";
      context->set_output(0, context->input(1));
      *done = true;
      return;
    }

    // Special case 2: split along the 1st dimension. We can share the
    // underlying buffer.
    //
    // Apply this optimization conservatively: if the input is aligned,
    // the resulting tensors must be aligned. It's conservative
    // because if the immediate consumers of the resulting tensors are
    // not using Eigen for computation, it's perfectly fine to avoid
    // the copying.
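    //
    // Illustrative example: splitting a [6, 4] tensor along dim 0 into
    // num_split = 3 yields three [2, 4] outputs, each created with
    // Tensor::Slice and aliasing the input buffer, so no data is copied.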
    if ((split_dim == 0) && IsInnerDimsSizeAligned<T>(input_shape)) {
      VLOG(1) << "Slice dim 0: " << input_shape.DebugString();
      const int64 delta = input_shape.dim_size(0) / num_split;
      for (int i = 0; i < num_split; ++i) {
        context->set_output(i, input.Slice(i * delta, (i + 1) * delta));
      }
      *done = true;
      return;
    }
  }

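  // Computes (prefix_dim_size, split_dim_size, suffix_dim_size) for the
  // given shape. Illustrative example: for input_shape = [2, 6, 3] and
  // split_dim = 1 this returns (2, 6, 3); for a [6, 3] input split on dim 0
  // it returns (1, 6, 3).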
  template <typename IndexType>
  std::tuple<IndexType, IndexType, IndexType> SetDims(
      const TensorShape& input_shape, int32 split_dim) const {
    static_assert(std::is_integral<IndexType>::value,
                  "IndexType must be an integer type");
    int32 prefix_dim_size = 1;
    for (int i = 0; i < split_dim; ++i) {
      prefix_dim_size *= input_shape.dim_size(i);
    }

    // Caller must ensure that dim_size and suffix_dim_size are <
    // std::numeric_limits<IndexType>::max()
    IndexType split_dim_size =
        static_cast<IndexType>(input_shape.dim_size(split_dim));

    IndexType suffix_dim_size = 1;
    for (int i = split_dim + 1; i < input_shape.dims(); ++i) {
      suffix_dim_size *= static_cast<IndexType>(input_shape.dim_size(i));
    }
    return std::make_tuple(prefix_dim_size, split_dim_size, suffix_dim_size);
  }
};

template <typename T, typename InputReshapedType, int NDims>
class SplitOpCPUImpl {
 public:
  template <typename MakeSizesType, typename ReshapeResultType>
  void operator()(OpKernelContext* context,
                  const InputReshapedType& input_reshaped,
                  const TensorShape& input_shape, int32 split_dim,
                  Eigen::DenseIndex prefix_dim_size,
                  Eigen::DenseIndex split_dim_size,
                  Eigen::DenseIndex suffix_dim_size,
                  const MakeSizesType& make_sizes,
                  const ReshapeResultType& reshape_result, int32 num_split,
                  int64 split_dim_output_size) const {
    const auto num_threads =
        context->device()->tensorflow_cpu_worker_threads()->num_threads;
    // TODO(jewillco): Tune heuristic further.
    const auto input_element_count = input_shape.num_elements();
    const bool use_parallelism_between_outputs =
        (num_split >= 4 &&
         input_element_count >= std::max(num_threads, num_split) * 4096 &&
         input_element_count < num_split * 180 * 1024);
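    // Illustrative reading of the heuristic above: with num_threads = 8 and
    // num_split = 16, parallelism between outputs is used only when the input
    // holds at least max(8, 16) * 4096 = 65,536 elements and fewer than
    // 16 * 180 * 1024 = 2,949,120 elements; inputs outside that range fall
    // back to the per-output functor, which may parallelize internally.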
    Eigen::DSizes<Eigen::DenseIndex, NDims> indices;
    for (int i = 0; i < NDims; ++i) {
      indices[i] = 0;
    }
    auto sizes = make_sizes(split_dim_output_size);
    TensorShape output_shape(input_shape);
    output_shape.set_dim(split_dim, split_dim_output_size);

    auto range_output_func = [&indices, context, &output_shape, prefix_dim_size,
                              split_dim_output_size, suffix_dim_size, &sizes,
                              use_parallelism_between_outputs, &input_reshaped,
                              &reshape_result](int64 start, int64 limit) {
      for (int64 i = start; i < limit; ++i) {
        Tensor* result = nullptr;
        OP_REQUIRES_OK(context,
                       context->allocate_output(i, output_shape, &result));
        if (prefix_dim_size * split_dim_output_size * suffix_dim_size > 0) {
          Eigen::DSizes<Eigen::DenseIndex, NDims> slice_indices;
          Eigen::DSizes<Eigen::DenseIndex, NDims> slice_sizes;
          for (int j = 0; j < NDims; ++j) {
            slice_indices[j] =
                (j == NDims - 2 ? i * split_dim_output_size : indices[j]);
            slice_sizes[j] = sizes[j];
          }
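          // In the reshaped view the split dimension is always the
          // second-to-last one (dim 0 of the 2-D view, dim 1 of the 3-D
          // view), hence the offset at index NDims - 2 above.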

          auto result_shaped = reshape_result(result, split_dim_output_size);

          if (use_parallelism_between_outputs) {
            // Use sequential implementation for single output.
            result_shaped = input_reshaped.slice(slice_indices, slice_sizes);
          } else {
            // This implementation may be parallel internally.
            functor::Split<CPUDevice, T, NDims>()(
                context->eigen_device<CPUDevice>(), result_shaped,
                input_reshaped, slice_indices, slice_sizes);
          }
        }
      }
    };
    if (use_parallelism_between_outputs) {
      // Run in parallel, disabling parallelism in functor.
      context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
          num_split, input_element_count / num_split, range_output_func);
    } else {
      // Run sequentially, but allow internal parallelism in functor.
      range_output_func(0, num_split);
    }
  }
};

template <typename T>
class SplitOpCPU : public SplitOpBase<CPUDevice, T> {
 public:
  typedef SplitOpBase<CPUDevice, T> Base;
  explicit SplitOpCPU(OpKernelConstruction* c) : Base(c) {}

  void Compute(OpKernelContext* context) override {
    bool done = false;
    Base::ComputeEasyCases(context, &done);
    if (!context->status().ok() || done) {
      return;
    }
    const int32 num_split = Base::num_outputs();
    const Tensor& input = context->input(1);
    const TensorShape& input_shape = input.shape();
    const int32 split_dim_orig = context->input(0).flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;

    // Android also uses int32 indexing, so check here also.
    OP_REQUIRES(
        context,
        FastBoundsCheck(input.NumElements(),
                        std::numeric_limits<Eigen::DenseIndex>::max()),
        errors::InvalidArgument("Split requires input size < ",
                                std::numeric_limits<Eigen::DenseIndex>::max()));

    Eigen::DenseIndex prefix_dim_size;
    Eigen::DenseIndex split_dim_size;
    Eigen::DenseIndex suffix_dim_size;

    std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
        Base::template SetDims<Eigen::DenseIndex>(input_shape, split_dim);

    const int64 split_dim_output_size = split_dim_size / num_split;

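    // When no dimensions precede split_dim (prefix_dim_size == 1), the input
    // is viewed as a 2-D [split_dim_size, suffix_dim_size] matrix so Eigen
    // can slice in two dimensions; otherwise a 3-D
    // [prefix_dim_size, split_dim_size, suffix_dim_size] view is used.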
    if (prefix_dim_size == 1) {
      auto input_reshaped =
          input.shaped<T, 2>({split_dim_size, suffix_dim_size});
      auto make_sizes = [&](Eigen::DenseIndex split_size) {
        return Eigen::DSizes<Eigen::DenseIndex, 2>{split_size, suffix_dim_size};
      };
      auto reshape_result = [&](Tensor* result, Eigen::DenseIndex split_size) {
        return result->shaped<T, 2>({split_size, suffix_dim_size});
      };
      SplitOpCPUImpl<T, decltype(input_reshaped), 2>{}(
          context, input_reshaped, input_shape, split_dim, prefix_dim_size,
          split_dim_size, suffix_dim_size, make_sizes, reshape_result,
          num_split, split_dim_output_size);
    } else {
      auto input_reshaped = input.shaped<T, 3>(
          {prefix_dim_size, split_dim_size, suffix_dim_size});
      auto make_sizes = [&](Eigen::DenseIndex split_size) {
        return Eigen::DSizes<Eigen::DenseIndex, 3>{prefix_dim_size, split_size,
                                                   suffix_dim_size};
      };
      auto reshape_result = [&](Tensor* result, Eigen::DenseIndex split_size) {
        return result->shaped<T, 3>(
            {prefix_dim_size, split_size, suffix_dim_size});
      };
      SplitOpCPUImpl<T, decltype(input_reshaped), 3>{}(
          context, input_reshaped, input_shape, split_dim, prefix_dim_size,
          split_dim_size, suffix_dim_size, make_sizes, reshape_result,
          num_split, split_dim_output_size);
    }
  }
};

#if GOOGLE_CUDA

// Partial specialization for GPU
template <typename T>
class SplitOpGPU : public SplitOpBase<GPUDevice, T> {
 public:
  typedef SplitOpBase<GPUDevice, T> Base;
  explicit SplitOpGPU(OpKernelConstruction* c) : Base(c) {}

  void Compute(OpKernelContext* context) override {
    bool done = false;
    Base::ComputeEasyCases(context, &done);
    if (!context->status().ok() || done) {
      return;
    }
    const Tensor& input = context->input(1);
    const TensorShape& input_shape = input.shape();
    const int32 split_dim_orig = context->input(0).flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
    const int32 num_split = Base::num_outputs();
    OP_REQUIRES(
        context,
        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
        errors::InvalidArgument("Split on GPU requires input size "
                                "< max int32"));
    int32 prefix_dim_size;
    int32 split_dim_size;
    int32 suffix_dim_size;
    std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
        Base::template SetDims<int32>(input_shape, split_dim);

    const int32 split_dim_output_size = split_dim_size / num_split;
    TensorShape output_shape(input_shape);
    output_shape.set_dim(split_dim, split_dim_output_size);

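    // The per-output pointers are staged in a GpuDeviceArrayOnHost and
    // finalized to the device so that a single SplitOpGPULaunch kernel can
    // write all num_split outputs in one launch.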
    GpuDeviceArrayOnHost<T*> ptrs(context, num_split);
    OP_REQUIRES_OK(context, ptrs.Init());

    for (int i = 0; i < num_split; ++i) {
      Tensor* result = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(i, output_shape, &result));
      ptrs.Set(i, result->flat<T>().data());
    }
    if (prefix_dim_size * split_dim_output_size * suffix_dim_size == 0) {
      return;
    }
    OP_REQUIRES_OK(context, ptrs.Finalize());

    SplitOpGPULaunch<T>().Run(context->eigen_device<GPUDevice>(),
                              input.flat<T>().data(), prefix_dim_size,
                              split_dim_size, suffix_dim_size, ptrs.data());
    OP_REQUIRES(context, context->op_device_context()->stream()->ok(),
                errors::Internal("Launch of gpu kernel for SplitOp failed"));
  }
};
#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
template <typename T>
class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
 public:
  typedef SplitOpBase<SYCLDevice, T> Base;
  explicit SplitOpSYCL(OpKernelConstruction* c) : Base(c) {}

  void Compute(OpKernelContext* context) override {
    bool done = false;
    Base::ComputeEasyCases(context, &done);
    if (!context->status().ok() || done) {
      return;
    }
    const Tensor& input = context->input(1);
    const TensorShape& input_shape = input.shape();
    const int32 split_dim_orig = context->input(0).flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
    const int32 num_split = Base::num_outputs();

    // Android also uses int32 indexing, so check here also.
    OP_REQUIRES(
        context,
        FastBoundsCheck(input.NumElements(),
                        std::numeric_limits<Eigen::DenseIndex>::max()),
        errors::InvalidArgument("Split requires input size < ",
                                std::numeric_limits<Eigen::DenseIndex>::max()));

    Eigen::DenseIndex prefix_dim_size;
    Eigen::DenseIndex split_dim_size;
    Eigen::DenseIndex suffix_dim_size;

    std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
        Base::template SetDims<Eigen::DenseIndex>(input_shape, split_dim);
    auto input_reshaped =
        input.shaped<T, 3>({prefix_dim_size, split_dim_size, suffix_dim_size});

    const int64 split_dim_output_size = split_dim_size / num_split;
    TensorShape output_shape(input_shape);
    output_shape.set_dim(split_dim, split_dim_output_size);

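    // Each output i receives the [prefix_dim_size, split_dim_output_size,
    // suffix_dim_size] block that starts at offset i * split_dim_output_size
    // along the middle (split) dimension; indices[1] is advanced by that
    // amount after each copy.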
    Eigen::DSizes<Eigen::DenseIndex, 3> indices{0, 0, 0};
    Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
        prefix_dim_size, split_dim_output_size, suffix_dim_size};

    for (int i = 0; i < num_split; ++i) {
      Tensor* result = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(i, output_shape, &result));
      if (prefix_dim_size * split_dim_output_size * suffix_dim_size > 0) {
        Eigen::DSizes<Eigen::DenseIndex, 3> slice_indices;
        Eigen::DSizes<Eigen::DenseIndex, 3> slice_sizes;
        for (int j = 0; j < 3; ++j) {
          slice_indices[j] = indices[j];
          slice_sizes[j] = sizes[j];
        }

        auto result_shaped = result->shaped<T, 3>(
            {prefix_dim_size, split_dim_output_size, suffix_dim_size});

        functor::Split<SYCLDevice, T>()(context->eigen_device<SYCLDevice>(),
                                        result_shaped, input_reshaped,
                                        slice_indices, slice_sizes);
      }
      indices[1] += split_dim_output_size;
    }
  }
};
#endif  // TENSORFLOW_USE_SYCL

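// Kernel registrations. split_dim is pinned to host memory because the
// kernel reads its scalar value directly on the host (see ComputeEasyCases).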
#define REGISTER_SPLIT(type)                             \
  REGISTER_KERNEL_BUILDER(Name("Split")                  \
                              .Device(DEVICE_CPU)        \
                              .TypeConstraint<type>("T") \
                              .HostMemory("split_dim"),  \
                          SplitOpCPU<type>)

TF_CALL_ALL_TYPES(REGISTER_SPLIT);
REGISTER_SPLIT(quint8);

#undef REGISTER_SPLIT

#if GOOGLE_CUDA

#define REGISTER_GPU(type)                               \
  REGISTER_KERNEL_BUILDER(Name("Split")                  \
                              .Device(DEVICE_GPU)        \
                              .TypeConstraint<type>("T") \
                              .HostMemory("split_dim"),  \
                          SplitOpGPU<type>)

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);
REGISTER_GPU(bfloat16);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(type)                              \
  REGISTER_KERNEL_BUILDER(Name("Split")                  \
                              .Device(DEVICE_SYCL)       \
                              .TypeConstraint<type>("T") \
                              .HostMemory("split_dim"),  \
                          SplitOpSYCL<type>)

TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL);
#undef REGISTER_SYCL

#endif  // TENSORFLOW_USE_SYCL

}  // end namespace tensorflow