/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#include <algorithm>
#include <cmath>
#include <type_traits>

#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/conv_ops.h"
#include "tensorflow/core/kernels/depthwise_conv_op.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA
#include "cuda/include/cudnn.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

// In depthwise convolution, one input channel is convolved into
// depth_multiplier output channels, and the outputs are not summed over
// channels the way they are in regular convolution.
// However, filters are applied to the input in exactly the same way as in
// regular convolution. Please refer to the regular convolution kernels for
// more details.

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// Computes the vectorized product of 'input_buffer' and 'filter' and stores
// the result in 'output' at the location specified by 'out_r' and 'out_c'.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
//
//   input_buffer [rows, cols, in_depth, depth_multiplier]
//     [a0, a0, a1, a1] [a2, a2, 0, 0] [b0, b0, b1, b1] [b2, b2, 0, 0]
//     [e0, e0, e1, e1] [e2, e2, 0, 0] [f0, f0, f1, f1] [f2, f2, 0, 0]
//
//   filter [rows, cols, in_depth, depth_multiplier]
//     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
//     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
//
//   First output register [in_depth, depth_multiplier]
//     [q0, q1, q2, q3] = ([a0, a0, a1, a1] x [u0, v0, w0, x0]) +
//                        ([b0, b0, b1, b1] x [u1, v1, w1, x1]) +
//                        ([e0, e0, e1, e1] x [u2, v2, w2, x2]) +
//                        ([f0, f0, f1, f1] x [u3, v3, w3, x3])
//
// TODO(andydavis) Experiment with processing multiple inputs per input buffer.
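
// For reference, the following is a minimal scalar sketch (added here for
// illustration only; it is not used by the vectorized DepthwiseConv2DKernel
// described above and defined below) of the value that kernel produces for
// one output element at batch b, row out_r, column out_c, and output channel
// d * depth_multiplier + m. It assumes the NHWC input layout and only the
// DepthwiseArgs fields declared in depthwise_conv_op.h.
template <typename T>
T DepthwiseConv2DScalarReference(const DepthwiseArgs& args, const T* input,
                                 const T* filter, int64 b, int64 out_r,
                                 int64 out_c, int64 d, int64 m) {
  T sum = static_cast<T>(0);
  for (int64 f_r = 0; f_r < args.filter_rows; ++f_r) {
    const int64 in_r = out_r * args.stride - args.pad_rows + f_r;
    if (in_r < 0 || in_r >= args.in_rows) continue;  // Implicit zero padding.
    for (int64 f_c = 0; f_c < args.filter_cols; ++f_c) {
      const int64 in_c = out_c * args.stride - args.pad_cols + f_c;
      if (in_c < 0 || in_c >= args.in_cols) continue;  // Implicit zero padding.
      // 'input' is [batch, in_rows, in_cols, in_depth]; 'filter' is
      // [filter_rows, filter_cols, in_depth, depth_multiplier].
      const T in_val =
          input[((b * args.in_rows + in_r) * args.in_cols + in_c) *
                    args.in_depth +
                d];
      const T filter_val =
          filter[((f_r * args.filter_cols + f_c) * args.in_depth + d) *
                     args.depth_multiplier +
                 m];
      sum += in_val * filter_val;
    }
  }
  return sum;
}
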
template <typename T>
struct DepthwiseConv2DKernel {
  static void Run(const DepthwiseArgs& args,
                  const int64 padded_filter_inner_dim_size, const int64 out_r,
                  const int64 out_c, const T* filter, const T* input_buffer,
                  T* output, TensorFormat data_format) {
    typedef typename Eigen::internal::packet_traits<T>::type Packet;
    static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

    const int64 out_depth = args.out_depth;
    const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    const int64 output_scalar_size = out_depth % kPacketSize;
    const int64 output_vectorized_size =
        (out_depth / kPacketSize) * kPacketSize;
    const int64 base_output_index =
        (out_r * args.out_cols + out_c) * out_depth;

    for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
      // Reset accumulator.
      auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
      for (int j = 0; j < filter_spatial_size; ++j) {
        // Calculate index.
        const int64 index = i + j * padded_filter_inner_dim_size;
        // Load filter.
        // TODO(andydavis) Unroll 'out_c' loop in caller so we can load
        // multiple inputs here to amortize the cost of each filter block load.
        const auto filter_block =
            Eigen::internal::ploadu<Packet>(filter + index);
        // Load input.
        const auto data_block =
            Eigen::internal::ploadu<Packet>(input_buffer + index);
        // Vector multiply-add.
        vaccum =
            Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
      }
      // Store vector accumulator to output.
      Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
    }

    if (output_scalar_size > 0) {
      auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
      for (int j = 0; j < filter_spatial_size; ++j) {
        const int64 index =
            output_vectorized_size + j * padded_filter_inner_dim_size;
        const auto filter_block =
            Eigen::internal::ploadu<Packet>(filter + index);
        const auto data_block =
            Eigen::internal::ploadu<Packet>(input_buffer + index);
        vaccum =
            Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
      }
      // Load accumulator into an array and loop through output.
      T out_buf[kPacketSize];
      Eigen::internal::pstoreu<T>(out_buf, vaccum);
      const int64 last_output_index =
          base_output_index + output_vectorized_size;
      for (int j = 0; j < output_scalar_size; ++j) {
        output[last_output_index + j] = out_buf[j];
      }
    }
  }
};

// Computes the depthwise conv2d of 'input' by 'depthwise_filter' and stores
// the result in 'output'. This implementation trades off copying small patches
// of the input to achieve better data alignment, which enables vectorized
// load/store and multiply-add operations (see comments at DepthwiseInputCopyOp
// and DepthwiseConv2DKernel for details).
//
// TODO(andydavis) Evaluate the performance of processing multiple input
// patches in the inner loop.
// TODO(andydavis) Consider a zero-copy implementation for the case when
// 'in_depth' is a multiple of register width, and 'depth_multiplier' is one.
// TODO(andydavis) Evaluate the performance of alternative implementations.
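//
// As a concrete example of the register-width padding used below: with float
// data on an AVX machine, kPacketSize == 8; for out_depth == 6 (in_depth == 3,
// depth_multiplier == 2, as in the example above DepthwiseConv2DKernel), the
// padded inner dimension is ((6 + 8 - 1) / 8) * 8 == 8, so each spatial filter
// position stores 6 real coefficients followed by 2 zeros.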
template <typename T>
struct LaunchDepthwiseConvOp<CPUDevice, T> {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;

  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* input, const T* depthwise_filter, T* output,
                  TensorFormat data_format) {
    OP_REQUIRES(
        ctx, data_format == FORMAT_NHWC,
        errors::Unimplemented(
            "Depthwise convolution on CPU is only supported for NHWC format"));
    static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

    // Pad 'depthwise_filter' to vector register width (if needed).
    const bool pad_filter = (args.out_depth % kPacketSize) != 0;
    Tensor padded_filter;
    if (pad_filter) {
      // Allocate space for padded filter.
      const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64 padded_filter_inner_dim_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({filter_spatial_size,
                                               padded_filter_inner_dim_size}),
                                  &padded_filter));
      // Write out padded filter.
      functor::DepthwiseFilterPadOp<T>()(
          args, depthwise_filter, padded_filter.template flat<T>().data());
    }
    const T* filter_data =
        pad_filter ? padded_filter.template flat<T>().data()
                   : depthwise_filter;

    // Computes one shard of depthwise conv2d output.
    auto shard = [&ctx, &args, &input, &filter_data, &output, data_format](
                     int64 start, int64 limit) {
      static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
      const int64 input_image_size =
          args.in_rows * args.in_cols * args.in_depth;
      const int64 output_image_size =
          args.out_rows * args.out_cols * args.out_depth;
      const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64 padded_filter_inner_dim_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

      // Allocate buffer for local input regions.
      Tensor input_buffer;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({filter_spatial_size,
                                               padded_filter_inner_dim_size}),
                                  &input_buffer));
      T* input_buffer_data = input_buffer.template flat<T>().data();

      for (int64 i = start; i < limit; ++i) {
        const int64 b = i / args.out_rows;
        const int64 in_base = b * input_image_size;
        const int64 out_base = b * output_image_size;

        const int64 out_r = i % args.out_rows;

        for (int64 out_c = 0; out_c < args.out_cols; ++out_c) {
          // Populate 'input_buffer_data' with data from local input region.
          functor::DepthwiseInputCopyOp<T>()(
              args, padded_filter_inner_dim_size, out_r, out_c,
              input + in_base, input_buffer_data);

          // Process buffered input across all filters and store to output.
          DepthwiseConv2DKernel<T>::Run(
              args, padded_filter_inner_dim_size, out_r, out_c, filter_data,
              input_buffer_data, output + out_base, data_format);
        }
      }
    };

    const int64 total_shards = args.batch * args.out_rows;

    // Empirically tested to give reasonable performance boosts at batch size 1
    // without reducing throughput at batch size 32.
    const float kCostMultiplier = 2.5f;

    // TODO(andydavis): Estimate shard cost (in cycles) based on the number of
    // flops/loads/stores required to compute one shard.
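    // For example, an output of 32x32 with out_depth == 64 gives a per-shard
    // (i.e. per output row, per batch element) cost estimate of
    // 2.5 * 32 * 64 = 5120, which Shard() uses together with 'total_shards'
    // to decide how many worker threads to spread the rows across.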
    const int64 shard_cost = kCostMultiplier * args.out_cols * args.out_depth;

    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
    Shard(worker_threads.num_threads, worker_threads.workers, total_shards,
          shard_cost, shard);
  }
};

// Extern template instantiated in conv_ops.cc.
extern template struct LaunchConv2DOp<CPUDevice, Eigen::half>;
extern template struct LaunchConv2DOp<CPUDevice, float>;
extern template struct LaunchConv2DOp<CPUDevice, double>;

#if GOOGLE_CUDA

// Extern template instantiated in conv_ops.cc.
extern template struct LaunchConv2DOp<GPUDevice, Eigen::half>;
extern template struct LaunchConv2DOp<GPUDevice, float>;
extern template struct LaunchConv2DOp<GPUDevice, double>;

// Extern template instantiated in depthwise_conv_op_gpu.cc.
extern template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA

template <typename Device, typename T>
class DepthwiseConv2dNativeOp : public BinaryOp<T> {
 public:
  explicit DepthwiseConv2dNativeOp(OpKernelConstruction* context)
      : BinaryOp<T>(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));

    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    stride_ = GetTensorDim(strides_, data_format_, 'H');
    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
    const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
    const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');

    OP_REQUIRES(context, stride_ == stride_w,
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (stride_n == 1 && stride_c == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));

    // For in_depth == 1 and grouped convolutions.
    use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
    cudnn_use_autotune_ = CudnnUseAutotune();
    use_cudnn_grouped_conv_ = false;
    dtype_ = DataTypeToEnum<T>::value;
  }

  void Compute(OpKernelContext* context) override {
    // Input tensor is of the following dimensions:
    // [ batch, in_rows, in_cols, in_depth ]
    const Tensor& input = context->input(0);

    // Input filter is of the following dimensions:
    // [ filter_rows, filter_cols, in_depth, depth_multiplier ]
    const Tensor& filter = context->input(1);

    // For 2D convolution, there should be 4 dimensions.
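    // (For example, an NHWC input of shape [8, 32, 32, 3] convolved with a
    // filter of shape [3, 3, 3, 2], i.e. depth_multiplier == 2, yields an
    // output of shape [8, out_rows, out_cols, 6].)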
    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional: ",
                                        input.shape().DebugString()));
    OP_REQUIRES(context, filter.dims() == 4,
                errors::InvalidArgument("filter must be 4-dimensional: ",
                                        filter.shape().DebugString()));

    // in_depth for input and filter must match.
    const int64 in_depth = GetTensorDim(input, data_format_, 'C');
    OP_REQUIRES(context, in_depth == filter.dim_size(2),
                errors::InvalidArgument(
                    "input and filter must have the same depth: ", in_depth,
                    " vs ", filter.dim_size(2)));

    // The last dimension for filter is depth multiplier.
    const int32 depth_multiplier = filter.dim_size(3);

    // The output depth is input depth x depth multiplier.
    const int32 out_depth = in_depth * depth_multiplier;

    const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
    OP_REQUIRES(
        context,
        FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()),
        errors::InvalidArgument("Input rows too large"));
    const int32 input_rows = static_cast<int32>(input_rows_raw);
    const int32 filter_rows = filter.dim_size(0);

    const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
    OP_REQUIRES(
        context,
        FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()),
        errors::InvalidArgument("Input cols too large"));
    const int32 input_cols = static_cast<int32>(input_cols_raw);
    const int32 filter_cols = filter.dim_size(1);

    // The first dimension for input is batch.
    const int32 batch = input.dim_size(0);

    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
    OP_REQUIRES_OK(context,
                   GetWindowedOutputSize(input_rows, filter_rows, stride_,
                                         padding_, &out_rows, &pad_rows));
    OP_REQUIRES_OK(context,
                   GetWindowedOutputSize(input_cols, filter_cols, stride_,
                                         padding_, &out_cols, &pad_cols));
    TensorShape out_shape =
        ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
    OP_REQUIRES(
        context,
        (!std::is_same<Device, GPUDevice>::value ||
         FastBoundsCheck(out_shape.num_elements(),
                         std::numeric_limits<int32>::max())),
        errors::InvalidArgument("Output elements too large for GPU kernel"));

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    // If there is nothing to compute, return.
    if (out_shape.num_elements() == 0) {
      return;
    }

    // TODO(csigg): Have autotune decide if native is faster than cuDNN.
    // If in_depth == 1, this operation is just a standard convolution.
    // Depthwise convolution is a special case of cuDNN's grouped convolution.
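    // The cuDNN path is taken either when the op degenerates to a regular
    // convolution (in_depth == 1) or when this kernel was registered with the
    // "cudnn_grouped_convolution" label (see DepthwiseConv2dGroupedConvOp
    // below), which sets use_cudnn_grouped_conv_ to true.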
    bool use_cudnn = use_cudnn_ && (in_depth == 1 || use_cudnn_grouped_conv_);

    VLOG(2) << "DepthwiseConv2dNative: "
            << " Input: [" << batch << ", " << input_rows << ", " << input_cols
            << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
            << filter_cols << ", " << in_depth << ", " << depth_multiplier
            << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
            << ", " << out_depth << "], stride = " << stride_
            << ", pad_rows = " << pad_rows << ", pad_cols = " << pad_cols
            << ", Use cuDNN: " << use_cudnn;

    if (use_cudnn) {
      // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
      //
      //                  | TensorFlow       | cuDNN
      // --------------------------------------------------------------------
      // filter_out_depth | depth_multiplier | depth_multiplier * group_count
      // filter_in_depth  | in_depth         | in_depth / group_count
      //
      // For depthwise convolution, we have group_count == in_depth.
      int32 filter_in_depth = 1;
      TensorShape shape =
          TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
      Tensor reshaped_filter(/*type=*/dtype_);
      OP_REQUIRES(
          context, reshaped_filter.CopyFrom(filter, shape),
          errors::Internal(
              "Failed to reshape filter tensor for grouped convolution."));
      // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
      // conv is supported.
      launcher_(context, use_cudnn_, cudnn_use_autotune_, input,
                reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
                stride_, stride_, padding_, /*explicit_paddings=*/{}, output,
                data_format_);
      return;
    }

    DepthwiseArgs args;
    args.batch = batch;
    args.in_rows = input_rows;
    args.in_cols = input_cols;
    args.in_depth = in_depth;
    args.filter_rows = filter_rows;
    args.filter_cols = filter_cols;
    args.depth_multiplier = depth_multiplier;
    args.stride = stride_;
    args.pad_rows = pad_rows;
    args.pad_cols = pad_cols;
    args.out_rows = out_rows;
    args.out_cols = out_cols;
    args.out_depth = out_depth;

    auto input_ptr = input.template flat<T>().data();
    auto filter_ptr = filter.template flat<T>().data();
    auto output_ptr = output->template flat<T>().data();
    LaunchDepthwiseConvOp<Device, T>()(context, args, input_ptr, filter_ptr,
                                       output_ptr, data_format_);
  }

 protected:
  bool use_cudnn_grouped_conv_;

 private:
  std::vector<int32> strides_;
  Padding padding_;
  TensorFormat data_format_;

  int64 stride_;  // in height/width dimension.

  // For in_depth == 1 and grouped convolutions.
  LaunchConv2DOp<Device, T> launcher_;
  bool use_cudnn_;
  bool cudnn_use_autotune_;
  DataType dtype_;

  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeOp);
};

#define REGISTER_CPU_KERNEL(T)                                                 \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      DepthwiseConv2dNativeOp<CPUDevice, T>)

TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
#endif

#if GOOGLE_CUDA

#define REGISTER_GPU_KERNEL(T)                                                 \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
      DepthwiseConv2dNativeOp<GPUDevice, T>)

TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);

#if CUDNN_VERSION >= 7000
template <typename T>
class DepthwiseConv2dGroupedConvOp
    : public DepthwiseConv2dNativeOp<GPUDevice, T> {
 public:
  DepthwiseConv2dGroupedConvOp(OpKernelConstruction* context)
      : DepthwiseConv2dNativeOp<GPUDevice, T>(context) {
    this->use_cudnn_grouped_conv_ = true;
  }
};

#define REGISTER_GROUPED_CONV_KERNEL(T)                            \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")            \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<T>("T")              \
                              .Label("cudnn_grouped_convolution"), \
                          DepthwiseConv2dGroupedConvOp<T>)

TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
#endif  // CUDNN_VERSION
#endif  // GOOGLE_CUDA

}  // namespace tensorflow