/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.

#define EIGEN_USE_THREADS

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/split_lib.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/util/work_sharder.h"
#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/kernels/gpu_device_array.h"
#include "tensorflow/core/kernels/split_lib_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL
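
// Common base for the CPU/GPU/SYCL Split kernels. ComputeEasyCases validates
// split_dim and num_split and handles the cases that require no copying:
// num_split == 1 (identity) and an aligned split along dimension 0, where
// each output can alias a slice of the input buffer. SetDims collapses the
// input shape into (prefix, split_dim, suffix) sizes for the device kernels.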
template <typename Device, typename T>
class SplitOpBase : public OpKernel {
 public:
  explicit SplitOpBase(OpKernelConstruction* c) : OpKernel(c) {}

  void ComputeEasyCases(OpKernelContext* context, bool* done) {
    const Tensor& input = context->input(1);
    const TensorShape& input_shape = input.shape();
    const Tensor& split_dim_tensor = context->input(0);
    OP_REQUIRES(
        context, split_dim_tensor.shape().dims() == 0,
        errors::InvalidArgument("split_dim must be a scalar but has rank ",
                                split_dim_tensor.shape().dims()));
    const int32 split_dim_orig = split_dim_tensor.flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
    const int32 num_split = num_outputs();

    OP_REQUIRES(
        context, 0 <= split_dim && split_dim < input_shape.dims(),
        errors::InvalidArgument("-input rank(-", input.dims(),
                                ") <= split_dim < input rank (", input.dims(),
                                "), but got ", split_dim_orig));

    OP_REQUIRES(
        context, num_split > 0,
        errors::InvalidArgument(
            "Number of ways to split should be > 0, but got ", num_split));

    OP_REQUIRES(context, input_shape.dim_size(split_dim) % num_split == 0,
                errors::InvalidArgument(
                    "Number of ways to split should evenly divide the split "
                    "dimension, but got split_dim ",
                    split_dim, " (size = ", input_shape.dim_size(split_dim),
                    ") ", "and num_split ", num_split));

    // Special case 1: num_split == 1. Nothing to do.
    if (num_split == 1) {
      VLOG(1) << "Split identity";
      context->set_output(0, context->input(1));
      *done = true;
      return;
    }

    // Special case 2: split along the 1st dimension. We can share the
    // underlying buffer.
    //
    // Apply this optimization conservatively: if the input is aligned,
    // the resulting tensors must be aligned. It's conservative because
    // if the immediate consumers of the resulting tensors are not using
    // Eigen for computation, it's perfectly fine to avoid the copying.
    if ((split_dim == 0) && IsInnerDimsSizeAligned<T>(input_shape)) {
      VLOG(1) << "Slice dim 0: " << input_shape.DebugString();
      const int64 delta = input_shape.dim_size(0) / num_split;
      for (int i = 0; i < num_split; ++i) {
        context->set_output(i, input.Slice(i * delta, (i + 1) * delta));
      }
      *done = true;
      return;
    }
  }

  template <typename IndexType>
  std::tuple<IndexType, IndexType, IndexType> SetDims(
      const TensorShape& input_shape, int32 split_dim) const {
    static_assert(std::is_integral<IndexType>::value,
                  "IndexType must be an integer type");
    int32 prefix_dim_size = 1;
    for (int i = 0; i < split_dim; ++i) {
      prefix_dim_size *= input_shape.dim_size(i);
    }

    // Caller must ensure that dim_size and suffix_dim_size are <
    // std::numeric_limits<IndexType>::max()
    IndexType split_dim_size =
        static_cast<IndexType>(input_shape.dim_size(split_dim));

    IndexType suffix_dim_size = 1;
    for (int i = split_dim + 1; i < input_shape.dims(); ++i) {
      suffix_dim_size *= static_cast<IndexType>(input_shape.dim_size(i));
    }
    return std::make_tuple(prefix_dim_size, split_dim_size, suffix_dim_size);
  }
};

template <typename T, typename InputReshapedType, int NDims>
class SplitOpCPUImpl {
 public:
  template <typename MakeSizesType, typename ReshapeResultType>
  void operator()(OpKernelContext* context,
                  const InputReshapedType& input_reshaped,
                  const TensorShape& input_shape, int32 split_dim,
                  Eigen::DenseIndex prefix_dim_size,
                  Eigen::DenseIndex split_dim_size,
                  Eigen::DenseIndex suffix_dim_size,
                  const MakeSizesType& make_sizes,
                  const ReshapeResultType& reshape_result, int32 num_split,
                  int64 split_dim_output_size) const {
    const auto num_threads =
        context->device()->tensorflow_cpu_worker_threads()->num_threads;
    // TODO(jewillco): Tune heuristic further.
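    // The heuristic below parallelizes across outputs only when there are at
    // least four outputs, the input provides at least 4096 elements per
    // worker thread and per output, and each output averages fewer than
    // 180 * 1024 elements; otherwise the Split functor, which may
    // parallelize internally, handles each output.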
    const auto input_element_count = input_shape.num_elements();
    const bool use_parallelism_between_outputs =
        (num_split >= 4 &&
         input_element_count >= std::max(num_threads, num_split) * 4096 &&
         input_element_count < num_split * 180 * 1024);
    Eigen::DSizes<Eigen::DenseIndex, NDims> indices;
    for (int i = 0; i < NDims; ++i) {
      indices[i] = 0;
    }
    auto sizes = make_sizes(split_dim_output_size);
    TensorShape output_shape(input_shape);
    output_shape.set_dim(split_dim, split_dim_output_size);

    auto range_output_func = [&indices, context, &output_shape, prefix_dim_size,
                              split_dim_output_size, suffix_dim_size, &sizes,
                              use_parallelism_between_outputs, &input_reshaped,
                              &reshape_result](int64 start, int64 limit) {
      for (int64 i = start; i < limit; ++i) {
        Tensor* result = nullptr;
        OP_REQUIRES_OK(context,
                       context->allocate_output(i, output_shape, &result));
        if (prefix_dim_size * split_dim_output_size * suffix_dim_size > 0) {
          Eigen::DSizes<Eigen::DenseIndex, NDims> slice_indices;
          Eigen::DSizes<Eigen::DenseIndex, NDims> slice_sizes;
          for (int j = 0; j < NDims; ++j) {
            slice_indices[j] =
                (j == NDims - 2 ? i * split_dim_output_size : indices[j]);
            slice_sizes[j] = sizes[j];
          }

          auto result_shaped = reshape_result(result, split_dim_output_size);

          if (use_parallelism_between_outputs) {
            // Copy each output sequentially here; the parallelism comes from
            // the ParallelFor over outputs below.
            result_shaped = input_reshaped.slice(slice_indices, slice_sizes);
          } else {
            // This implementation may be parallel internally.
            functor::Split<CPUDevice, T, NDims>()(
                context->eigen_device<CPUDevice>(), result_shaped,
                input_reshaped, slice_indices, slice_sizes);
          }
        }
      }
    };
    if (use_parallelism_between_outputs) {
      // Run in parallel, disabling parallelism in functor.
      context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
          num_split, input_element_count / num_split, range_output_func);
    } else {
      // Run sequentially, but allow internal parallelism in functor.
      range_output_func(0, num_split);
    }
  }
};
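
// CPU kernel. After the easy cases, the input is viewed as a 2-D tensor when
// the dimensions before split_dim collapse to size 1, or as a 3-D tensor
// otherwise, and SplitOpCPUImpl above performs the per-output slicing.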
template <typename T>
class SplitOpCPU : public SplitOpBase<CPUDevice, T> {
 public:
  typedef SplitOpBase<CPUDevice, T> Base;
  explicit SplitOpCPU(OpKernelConstruction* c) : Base(c) {}

  void Compute(OpKernelContext* context) override {
    bool done = false;
    Base::ComputeEasyCases(context, &done);
    if (!context->status().ok() || done) {
      return;
    }
    const int32 num_split = Base::num_outputs();
    const Tensor& input = context->input(1);
    const TensorShape& input_shape = input.shape();
    const int32 split_dim_orig = context->input(0).flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;

    // Android also uses int32 indexing, so check here also.
    OP_REQUIRES(
        context,
        FastBoundsCheck(input.NumElements(),
                        std::numeric_limits<Eigen::DenseIndex>::max()),
        errors::InvalidArgument("Split requires input size < ",
                                std::numeric_limits<Eigen::DenseIndex>::max()));

    Eigen::DenseIndex prefix_dim_size;
    Eigen::DenseIndex split_dim_size;
    Eigen::DenseIndex suffix_dim_size;

    std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
        Base::template SetDims<Eigen::DenseIndex>(input_shape, split_dim);

    const int64 split_dim_output_size = split_dim_size / num_split;

    if (prefix_dim_size == 1) {
      auto input_reshaped =
          input.shaped<T, 2>({split_dim_size, suffix_dim_size});
      auto make_sizes = [&](Eigen::DenseIndex split_size) {
        return Eigen::DSizes<Eigen::DenseIndex, 2>{split_size, suffix_dim_size};
      };
      auto reshape_result = [&](Tensor* result, Eigen::DenseIndex split_size) {
        return result->shaped<T, 2>({split_size, suffix_dim_size});
      };
      SplitOpCPUImpl<T, decltype(input_reshaped), 2>{}(
          context, input_reshaped, input_shape, split_dim, prefix_dim_size,
          split_dim_size, suffix_dim_size, make_sizes, reshape_result,
          num_split, split_dim_output_size);
    } else {
      auto input_reshaped = input.shaped<T, 3>(
          {prefix_dim_size, split_dim_size, suffix_dim_size});
      auto make_sizes = [&](Eigen::DenseIndex split_size) {
        return Eigen::DSizes<Eigen::DenseIndex, 3>{prefix_dim_size, split_size,
                                                   suffix_dim_size};
      };
      auto reshape_result = [&](Tensor* result, Eigen::DenseIndex split_size) {
        return result->shaped<T, 3>(
            {prefix_dim_size, split_size, suffix_dim_size});
      };
      SplitOpCPUImpl<T, decltype(input_reshaped), 3>{}(
          context, input_reshaped, input_shape, split_dim, prefix_dim_size,
          split_dim_size, suffix_dim_size, make_sizes, reshape_result,
          num_split, split_dim_output_size);
    }
  }
};

#if GOOGLE_CUDA

// Partial specialization for GPU
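// It allocates all outputs, gathers their device pointers into a
// GpuDeviceArray, and performs a single kernel launch that copies every
// output's slice, avoiding a separate launch per output.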
template <typename T>
class SplitOpGPU : public SplitOpBase<GPUDevice, T> {
 public:
  typedef SplitOpBase<GPUDevice, T> Base;
  explicit SplitOpGPU(OpKernelConstruction* c) : Base(c) {}

  void Compute(OpKernelContext* context) override {
    bool done = false;
    Base::ComputeEasyCases(context, &done);
    if (!context->status().ok() || done) {
      return;
    }
    const Tensor& input = context->input(1);
    const TensorShape& input_shape = input.shape();
    const int32 split_dim_orig = context->input(0).flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
    const int32 num_split = Base::num_outputs();
    OP_REQUIRES(
        context,
        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
        errors::InvalidArgument("Split on GPU requires input size "
                                "< max int32"));
    int32 prefix_dim_size;
    int32 split_dim_size;
    int32 suffix_dim_size;
    std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
        Base::template SetDims<int32>(input_shape, split_dim);

    const int32 split_dim_output_size = split_dim_size / num_split;
    TensorShape output_shape(input_shape);
    output_shape.set_dim(split_dim, split_dim_output_size);

    GpuDeviceArrayOnHost<T*> ptrs(context, num_split);
    OP_REQUIRES_OK(context, ptrs.Init());

    for (int i = 0; i < num_split; ++i) {
      Tensor* result = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(i, output_shape, &result));
      ptrs.Set(i, result->flat<T>().data());
    }
    if (prefix_dim_size * split_dim_output_size * suffix_dim_size == 0) {
      return;
    }
    OP_REQUIRES_OK(context, ptrs.Finalize());

    SplitOpGPULaunch<T>().Run(context->eigen_device<GPUDevice>(),
                              input.flat<T>().data(), prefix_dim_size,
                              split_dim_size, suffix_dim_size, ptrs.data());
    OP_REQUIRES(context, context->op_device_context()->stream()->ok(),
                errors::Internal("Launch of gpu kernel for SplitOp failed"));
  }
};
#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
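// SYCL kernel: views the input as a 3-D tensor and copies one slice per
// output with the Split functor, advancing the offset along the split
// dimension after each output.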
template <typename T>
class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
 public:
  typedef SplitOpBase<SYCLDevice, T> Base;
  explicit SplitOpSYCL(OpKernelConstruction* c) : Base(c) {}

  void Compute(OpKernelContext* context) override {
    bool done = false;
    Base::ComputeEasyCases(context, &done);
    if (!context->status().ok() || done) {
      return;
    }
    const Tensor& input = context->input(1);
    const TensorShape& input_shape = input.shape();
    const int32 split_dim_orig = context->input(0).flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
    const int32 num_split = Base::num_outputs();

    // Android also uses int32 indexing, so check here also.
    OP_REQUIRES(
        context,
        FastBoundsCheck(input.NumElements(),
                        std::numeric_limits<Eigen::DenseIndex>::max()),
        errors::InvalidArgument("Split requires input size < ",
                                std::numeric_limits<Eigen::DenseIndex>::max()));

    Eigen::DenseIndex prefix_dim_size;
    Eigen::DenseIndex split_dim_size;
    Eigen::DenseIndex suffix_dim_size;

    std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
        Base::template SetDims<Eigen::DenseIndex>(input_shape, split_dim);
    auto input_reshaped =
        input.shaped<T, 3>({prefix_dim_size, split_dim_size, suffix_dim_size});

    const int64 split_dim_output_size = split_dim_size / num_split;
    TensorShape output_shape(input_shape);
    output_shape.set_dim(split_dim, split_dim_output_size);

    Eigen::DSizes<Eigen::DenseIndex, 3> indices{0, 0, 0};
    Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
        prefix_dim_size, split_dim_output_size, suffix_dim_size};

    for (int i = 0; i < num_split; ++i) {
      Tensor* result = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(i, output_shape, &result));
      if (prefix_dim_size * split_dim_output_size * suffix_dim_size > 0) {
        Eigen::DSizes<Eigen::DenseIndex, 3> slice_indices;
        Eigen::DSizes<Eigen::DenseIndex, 3> slice_sizes;
        for (int j = 0; j < 3; ++j) {
          slice_indices[j] = indices[j];
          slice_sizes[j] = sizes[j];
        }

        auto result_shaped = result->shaped<T, 3>(
            {prefix_dim_size, split_dim_output_size, suffix_dim_size});

        functor::Split<SYCLDevice, T>()(context->eigen_device<SYCLDevice>(),
                                        result_shaped, input_reshaped,
                                        slice_indices, slice_sizes);
      }
      indices[1] += split_dim_output_size;
    }
  }
};
#endif  // TENSORFLOW_USE_SYCL

#define REGISTER_SPLIT(type)                             \
  REGISTER_KERNEL_BUILDER(Name("Split")                  \
                              .Device(DEVICE_CPU)        \
                              .TypeConstraint<type>("T") \
                              .HostMemory("split_dim"),  \
                          SplitOpCPU<type>)

TF_CALL_ALL_TYPES(REGISTER_SPLIT);
REGISTER_SPLIT(quint8);

#undef REGISTER_SPLIT

#if GOOGLE_CUDA

#define REGISTER_GPU(type)                               \
  REGISTER_KERNEL_BUILDER(Name("Split")                  \
                              .Device(DEVICE_GPU)        \
                              .TypeConstraint<type>("T") \
                              .HostMemory("split_dim"),  \
                          SplitOpGPU<type>)

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);
REGISTER_GPU(bfloat16);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(type)                              \
  REGISTER_KERNEL_BUILDER(Name("Split")                  \
                              .Device(DEVICE_SYCL)       \
                              .TypeConstraint<type>("T") \
                              .HostMemory("split_dim"),  \
                          SplitOpSYCL<type>)

TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL);
#undef REGISTER_SYCL

#endif  // TENSORFLOW_USE_SYCL

}  // end namespace tensorflow