/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS

#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA

#include "tensorflow/core/kernels/conv_ops.h"

#include <string.h>
#include <map>
#include <vector>

#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/deep_conv2d.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"

#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
#include "tensorflow/core/kernels/xsmm_conv2d.h"
#endif

#if GOOGLE_CUDA
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/util/proto/proto_utils.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

namespace {
template <typename Device, typename T>
struct LaunchGeneric {
  void operator()(OpKernelContext* ctx, const Tensor& input,
                  const Tensor& filter, int row_stride, int col_stride,
                  int row_dilation, int col_dilation, const Padding& padding,
                  Tensor* output, TensorFormat data_format) {
    CHECK(data_format == FORMAT_NHWC) << "Generic conv implementation only "
                                         "supports NHWC tensor format for now.";
    if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 &&
        col_stride == 1) {
      // For a 1x1 kernel, the 2D convolution is reduced to matrix
      // multiplication.
      //
      // TODO(vrv): We should be able to call SpatialConvolution
      // and it will produce the same result, but doing so
      // led to NaNs during training. Using matmul instead for now.
      int conv_width = 1;  // Width for the convolution step.
      for (int i = 0; i < 3; ++i) {
        conv_width *= output->dim_size(i);
      }
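      // Shape bookkeeping, for illustration: with NHWC input [N, H, W, C] and
      // a 1x1 filter [1, 1, C, K], the first three output dimensions collapse
      // into conv_width = N * H_out * W_out, so the contraction below is a
      // [conv_width, C] x [C, K] matmul producing the [conv_width, K] output.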

      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
      functor::MatMulConvFunctor<Device, T>()(
          ctx->eigen_device<Device>(),
          output->shaped<T, 2>({conv_width, filter.dim_size(3)}),
          input.shaped<T, 2>({conv_width, filter.dim_size(2)}),
          filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}),
          dim_pair);
    } else if (filter.dim_size(0) == input.dim_size(1) &&
               filter.dim_size(1) == input.dim_size(2) && row_dilation == 1 &&
               col_dilation == 1 && padding == VALID) {
      // If the input data and filter have the same height/width,
      // the 2D convolution is reduced to matrix multiplication.
      const int k =  // Length of reduction dimension.
          filter.dim_size(0) * filter.dim_size(1) * filter.dim_size(2);
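      // For example, a VALID convolution whose filter covers the entire input
      // (filter height == input height, filter width == input width) has a
      // 1x1 spatial output, so each batch element reduces to a dot product of
      // length k = filter_rows * filter_cols * in_depth against each of the
      // out_depth filters: a [batch, k] x [k, out_depth] matmul.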

      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
      functor::MatMulConvFunctor<Device, T>()(
          ctx->eigen_device<Device>(),
          output->shaped<T, 2>({input.dim_size(0), filter.dim_size(3)}),
          input.shaped<T, 2>({input.dim_size(0), k}),
          filter.shaped<T, 2>({k, filter.dim_size(3)}), dim_pair);
    } else {
      functor::SpatialConvolution<Device, T>()(
          ctx->eigen_device<Device>(), output->tensor<T, 4>(),
          input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride, col_stride,
          row_dilation, col_dilation, BrainPadding2EigenPadding(padding));
    }
  }
};
}  // namespace

template <typename T>
struct LaunchConv2DOp<CPUDevice, T> {
  void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                  const Tensor& input, const Tensor& filter, int row_dilation,
                  int col_dilation, int row_stride, int col_stride,
                  const Padding& padding,
                  const std::vector<int64>& explicit_paddings, Tensor* output,
                  TensorFormat data_format) {
    if (data_format != FORMAT_NHWC) {
      ctx->SetStatus(
          errors::Unimplemented("Generic conv implementation only supports "
                                "NHWC tensor format for now."));
      return;
    }
    // TODO(reedwm): Enable explicit padding on the CPU.
    OP_REQUIRES(
        ctx, padding != Padding::EXPLICIT,
        errors::Unimplemented("Generic conv implementation does not support "
                              "EXPLICIT padding yet."));
    const int64 in_depth = GetTensorDim(input, data_format, 'C');
    OP_REQUIRES(ctx, in_depth == filter.dim_size(2),
                errors::Unimplemented("Generic conv implementation does not "
                                      "support grouped convolutions for now."));
    LaunchGeneric<CPUDevice, T>()(ctx, input, filter, row_stride, col_stride,
                                  row_dilation, col_dilation, padding, output,
                                  data_format);
  }
};

template <typename Device, typename T>
class LaunchDeepConvOp {
 public:
  static bool Run(OpKernelContext* ctx, const Tensor& input,
                  const Tensor& filter, int batch, int input_rows,
                  int input_cols, int in_depth, int filter_rows,
                  int filter_cols, int pad_rows, int pad_cols, int out_rows,
                  int /*out_cols*/, int /*out_depth*/, int /*dilation_rows*/,
                  int /*dilation_cols*/, int /*stride_rows*/,
                  int /*stride_cols*/, Tensor* /*output*/,
                  TensorFormat /*data_format*/) {
    return false;
  }
};

// Conditionally launches DeepConv operation based on convolution parameters.
template <>
class LaunchDeepConvOp<CPUDevice, float> {
 public:
  static bool Run(OpKernelContext* ctx, const Tensor& input,
                  const Tensor& filter, int batch, int input_rows,
                  int input_cols, int in_depth, int filter_rows,
                  int filter_cols, int pad_rows, int pad_cols, int out_rows,
                  int out_cols, int out_depth, int dilation_rows,
                  int dilation_cols, int stride_rows, int stride_cols,
                  Tensor* output, TensorFormat data_format) {
    if (data_format != FORMAT_NHWC || dilation_rows != 1 ||
        dilation_cols != 1 ||
        !CanUseDeepConv2D(stride_rows, stride_cols, filter_rows, filter_cols,
                          in_depth, out_depth, out_rows, out_cols)) {
      return false;
    }

    Conv2DArgs args;
    args.batch = batch;
    args.in_rows = input_rows;
    args.in_cols = input_cols;
    args.in_depth = in_depth;
    args.filter_rows = filter_rows;
    args.filter_cols = filter_cols;
    args.pad_rows = pad_rows;
    args.pad_cols = pad_cols;
    args.out_rows = out_rows;
    args.out_cols = out_cols;
    args.out_depth = out_depth;

    auto input_ptr = input.template flat<float>().data();
    auto filter_ptr = filter.template flat<float>().data();
    auto output_ptr = output->template flat<float>().data();

    functor::DeepConv2D<CPUDevice, float>()(ctx, args, input_ptr, filter_ptr,
                                            output_ptr);
    return true;
  }
};

#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
template <typename Device, typename T>
class LaunchXsmmConvOp {
 public:
  static bool Run(OpKernelContext* ctx, const Tensor& input,
                  const Tensor& filter, int batch, int input_rows,
                  int input_cols, int in_depth, int filter_rows,
                  int filter_cols, int pad_rows, int pad_cols, int out_rows,
                  int out_cols, int out_depth, int stride_rows, int stride_cols,
                  int dilation_rows, int dilation_cols, Tensor* output,
                  TensorFormat data_format) {
    return false;
  }
};

template <>
class LaunchXsmmConvOp<CPUDevice, float> {
 public:
  static bool Run(OpKernelContext* ctx, const Tensor& input,
                  const Tensor& filter, int batch, int input_rows,
                  int input_cols, int in_depth, int filter_rows,
                  int filter_cols, int pad_rows, int pad_cols, int out_rows,
                  int out_cols, int out_depth, int dilation_rows,
                  int dilation_cols, int stride_rows, int stride_cols,
                  Tensor* output, TensorFormat data_format) {
    auto num_threads =
        ctx->device()->tensorflow_cpu_worker_threads()->num_threads;
    // See libxsmm_dnn.h for this struct definition.
    libxsmm_dnn_conv_desc desc;
    desc.N = batch;
    desc.C = in_depth;
    desc.H = input_rows;
    desc.W = input_cols;
    desc.K = out_depth;
    desc.R = filter_rows;
    desc.S = filter_cols;
    desc.u = stride_rows;
    desc.v = stride_cols;
    desc.pad_h = pad_rows;
    desc.pad_w = pad_cols;
    desc.pad_h_in = 0;
    desc.pad_w_in = 0;
    desc.pad_h_out = 0;
    desc.pad_w_out = 0;
    desc.threads = num_threads;
    desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
    desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
    desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
    desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
    desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE_OVERWRITE;
    desc.datatype = LIBXSMM_DNN_DATATYPE_F32;

    if (dilation_rows != 1 || dilation_cols != 1 ||
        !CanUseXsmmConv2D(desc, data_format)) {
      return false;
    }

    auto input_ptr = input.template flat<float>().data();
    auto filter_ptr = filter.template flat<float>().data();
    auto output_ptr = output->template flat<float>().data();

    bool success = functor::XsmmFwdConv2D<CPUDevice, float>()(
        ctx, desc, input_ptr, filter_ptr, output_ptr);
    return success;
  }
};
#endif

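// TF_REQUIRES is a local convenience macro: unlike OP_REQUIRES, which records
// the error on an OpKernelContext, it simply returns the given Status from the
// enclosing Status-returning function when the condition fails.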
#define TF_REQUIRES(EXP, STATUS)                \
  do {                                          \
    if (!TF_PREDICT_TRUE(EXP)) return (STATUS); \
  } while (false)

Status InitConv2DParameters(const OpKernelConstruction* context,
                            Conv2DParameters* params) {
  TF_RETURN_IF_ERROR(context->GetAttr("dilations", &params->dilations));
  TF_RETURN_IF_ERROR(context->GetAttr("strides", &params->strides));
  TF_RETURN_IF_ERROR(context->GetAttr("padding", &params->padding));
  if (context->HasAttr("explicit_paddings")) {
    TF_RETURN_IF_ERROR(
        context->GetAttr("explicit_paddings", &params->explicit_paddings));
  }
  string data_format_string;
  TF_RETURN_IF_ERROR(context->GetAttr("data_format", &data_format_string));
  TF_REQUIRES(FormatFromString(data_format_string, &params->data_format),
              errors::InvalidArgument("Invalid data format"));

  const auto& strides = params->strides;
  const auto& dilations = params->dilations;
  const auto& data_format = params->data_format;

  TF_REQUIRES(dilations.size() == 4,
              errors::InvalidArgument("Sliding window dilations field must "
                                      "specify 4 dimensions"));
  TF_REQUIRES(strides.size() == 4,
              errors::InvalidArgument("Sliding window strides field must "
                                      "specify 4 dimensions"));
  const int64 stride_n = GetTensorDim(strides, data_format, 'N');
  const int64 stride_c = GetTensorDim(strides, data_format, 'C');
  const int64 stride_h = GetTensorDim(strides, data_format, 'H');
  const int64 stride_w = GetTensorDim(strides, data_format, 'W');
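  // For example, with data_format = NHWC and strides = {1, 2, 2, 1}, this
  // yields stride_n = 1, stride_h = 2, stride_w = 2, stride_c = 1.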
  TF_REQUIRES(
      stride_n == 1 && stride_c == 1,
      errors::InvalidArgument("Current implementation does not yet support "
                              "strides in the batch and depth dimensions."));
  TF_REQUIRES(stride_h > 0 && stride_w > 0,
              errors::InvalidArgument(
                  "Row and column strides should be larger than 0."));

  const int64 dilation_n = GetTensorDim(dilations, data_format, 'N');
  const int64 dilation_c = GetTensorDim(dilations, data_format, 'C');
  const int64 dilation_h = GetTensorDim(dilations, data_format, 'H');
  const int64 dilation_w = GetTensorDim(dilations, data_format, 'W');
  TF_REQUIRES(
      dilation_n == 1 && dilation_c == 1,
      errors::InvalidArgument("Current implementation does not yet support "
                              "dilations in the batch and depth dimensions."));
  TF_REQUIRES(
      dilation_h > 0 && dilation_w > 0,
      errors::InvalidArgument("Dilated rates should be larger than 0."));

  TF_RETURN_IF_ERROR(CheckValidPadding(params->padding,
                                       params->explicit_paddings,
                                       /*num_dims=*/4, data_format));

  return Status::OK();
}

Status ComputeConv2DDimension(const Conv2DParameters& params,
                              const Tensor& input, const Tensor& filter,
                              Conv2DDimensions* dimensions) {
  // Check that 2D convolution input and filter have exactly 4 dimensions.
  TF_REQUIRES(input.dims() == 4,
              errors::InvalidArgument("input must be 4-dimensional: ",
                                      input.shape().DebugString()));
  TF_REQUIRES(filter.dims() == 4,
              errors::InvalidArgument("filter must be 4-dimensional: ",
                                      filter.shape().DebugString()));
  for (int i = 0; i < 3; i++) {
    TF_REQUIRES(
        FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
        errors::InvalidArgument("filter too large"));
  }

  // The last dimension for input is in_depth. Check that it is the same as
  // the filter's in_depth or is evenly divisible by the filter's in_depth.
  const int64 in_depth_raw = GetTensorDim(input, params.data_format, 'C');
  const int64 patch_depth_raw = filter.dim_size(2);
  TF_REQUIRES(FastBoundsCheck(in_depth_raw, std::numeric_limits<int>::max()),
              errors::InvalidArgument("Input depth too large"));
  TF_REQUIRES(FastBoundsCheck(patch_depth_raw, std::numeric_limits<int>::max()),
              errors::InvalidArgument("Patch depth too large"));
  const int in_depth = static_cast<int>(in_depth_raw);
  const int patch_depth = static_cast<int>(patch_depth_raw);
  TF_REQUIRES(in_depth % patch_depth == 0,
              errors::InvalidArgument(
                  "input depth must be evenly divisible by filter depth: ",
                  in_depth, " vs ", patch_depth));
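  // For example, in_depth = 64 with patch_depth = 32 describes a grouped
  // convolution with two groups (the GPU path below sets
  // group_count = in_depth / patch_depth); patch_depth = 1 would be a
  // depthwise convolution. The generic CPU path rejects grouped convolutions.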

  // The last dimension for filter is out_depth.
  const int out_depth = static_cast<int>(filter.dim_size(3));

  // The second dimension for input is rows/height.
  // The first dimension for filter is rows/height.
  const int64 input_rows_raw = GetTensorDim(input, params.data_format, 'H');
  TF_REQUIRES(FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
              errors::InvalidArgument("Input rows too large"));
  const int input_rows = static_cast<int>(input_rows_raw);
  const int filter_rows = static_cast<int>(filter.dim_size(0));

  // The third dimension for input is columns/width.
  // The second dimension for filter is columns/width.
  const int64 input_cols_raw = GetTensorDim(input, params.data_format, 'W');
  TF_REQUIRES(FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
              errors::InvalidArgument("Input cols too large"));
  const int input_cols = static_cast<int>(input_cols_raw);
  const int filter_cols = static_cast<int>(filter.dim_size(1));

  // The first dimension for input is batch.
  const int64 batch_raw = GetTensorDim(input, params.data_format, 'N');
  TF_REQUIRES(FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()),
              errors::InvalidArgument("batch is too large"));
  const int batch = static_cast<int>(batch_raw);

  // Take the stride and dilation from the second and third dimensions only (we
  // do not support striding or dilation on the batch or depth dimension).
  const int stride_rows = GetTensorDim(params.strides, params.data_format, 'H');
  const int stride_cols = GetTensorDim(params.strides, params.data_format, 'W');
  const int dilation_rows =
      GetTensorDim(params.dilations, params.data_format, 'H');
  const int dilation_cols =
      GetTensorDim(params.dilations, params.data_format, 'W');

  int64 pad_rows_before, pad_rows_after, pad_cols_before, pad_cols_after;
  if (params.padding == Padding::EXPLICIT) {
    GetExplicitPaddingForDim(params.explicit_paddings, params.data_format, 'H',
                             &pad_rows_before, &pad_rows_after);
    GetExplicitPaddingForDim(params.explicit_paddings, params.data_format, 'W',
                             &pad_cols_before, &pad_cols_after);
  }

  // Compute windowed output sizes for rows and columns.
  int64 out_rows = 0, out_cols = 0;
  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
      input_rows, filter_rows, dilation_rows, stride_rows, params.padding,
      &out_rows, &pad_rows_before, &pad_rows_after));
  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
      input_cols, filter_cols, dilation_cols, stride_cols, params.padding,
      &out_cols, &pad_cols_before, &pad_cols_after));
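  // Roughly: with effective filter size f_eff = (filter - 1) * dilation + 1,
  // VALID padding gives out = ceil((in - f_eff + 1) / stride) and no padding,
  // while SAME padding gives out = ceil(in / stride) and splits the required
  // padding between the before/after sides (any extra pixel goes on the
  // after side).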

  dimensions->batch = batch;
  dimensions->input_rows = input_rows;
  dimensions->input_cols = input_cols;
  dimensions->in_depth = in_depth;
  dimensions->filter_rows = filter_rows;
  dimensions->filter_cols = filter_cols;
  dimensions->patch_depth = patch_depth;
  dimensions->out_depth = out_depth;
  dimensions->stride_rows = stride_rows;
  dimensions->stride_cols = stride_cols;
  dimensions->dilation_rows = dilation_rows;
  dimensions->dilation_cols = dilation_cols;
  dimensions->out_rows = out_rows;
  dimensions->out_cols = out_cols;
  dimensions->pad_rows_before = pad_rows_before;
  dimensions->pad_rows_after = pad_rows_after;
  dimensions->pad_cols_before = pad_cols_before;
  dimensions->pad_cols_after = pad_cols_after;

  return Status::OK();
}

#undef TF_REQUIRES

template <typename Device, typename T>
class Conv2DOp : public BinaryOp<T> {
 public:
  explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
    OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_));

    OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
    use_cudnn_ &= CanUseCudnn();
    cudnn_use_autotune_ = CudnnUseAutotune();
  }

  void Compute(OpKernelContext* context) override {
    // Input tensor is of the following dimensions:
    // [ batch, in_rows, in_cols, in_depth ]
    const Tensor& input = context->input(0);

    // Input filter is of the following dimensions:
    // [ filter_rows, filter_cols, in_depth, out_depth]
    const Tensor& filter = context->input(1);

    Conv2DDimensions dimensions;
    OP_REQUIRES_OK(context,
                   ComputeConv2DDimension(params_, input, filter, &dimensions));

    TensorShape out_shape = ShapeFromFormat(
        params_.data_format, dimensions.batch, dimensions.out_rows,
        dimensions.out_cols, dimensions.out_depth);

    // Output tensor is of the following dimensions:
    // [ in_batch, out_rows, out_cols, out_depth ]
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    VLOG(2) << "Conv2D: in_depth = " << dimensions.in_depth
            << ", patch_depth = " << dimensions.patch_depth
            << ", input_cols = " << dimensions.input_cols
            << ", filter_cols = " << dimensions.filter_cols
            << ", input_rows = " << dimensions.input_rows
            << ", filter_rows = " << dimensions.filter_rows
            << ", stride_rows = " << dimensions.stride_rows
            << ", stride_cols = " << dimensions.stride_cols
            << ", dilation_rows = " << dimensions.dilation_rows
            << ", dilation_cols = " << dimensions.dilation_cols
            << ", out_depth = " << dimensions.out_depth;

    // If there is nothing to compute, return.
    if (out_shape.num_elements() == 0) {
      return;
    }

#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
    if (params_.padding != EXPLICIT &&
        LaunchXsmmConvOp<Device, T>::Run(
            context, input, filter, dimensions.batch, dimensions.input_rows,
            dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
            dimensions.filter_cols, dimensions.pad_rows_before,
            dimensions.pad_cols_before, dimensions.out_rows,
            dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
            dimensions.dilation_cols, dimensions.stride_rows,
            dimensions.stride_cols, output, params_.data_format)) {
      return;
    }
#endif

    if (params_.padding != EXPLICIT &&
        LaunchDeepConvOp<Device, T>::Run(
            context, input, filter, dimensions.batch, dimensions.input_rows,
            dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
            dimensions.filter_cols, dimensions.pad_rows_before,
            dimensions.pad_cols_before, dimensions.out_rows,
            dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
            dimensions.dilation_cols, dimensions.stride_rows,
            dimensions.stride_cols, output, params_.data_format)) {
      return;
    }

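    // Neither specialized path applied, so fall back to the generic launcher:
    // the Eigen-based LaunchGeneric on the CPU, or the cuDNN/cuBLAS
    // implementation on the GPU.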
    launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
              dimensions.dilation_rows, dimensions.dilation_cols,
              dimensions.stride_rows, dimensions.stride_cols, params_.padding,
              params_.explicit_paddings, output, params_.data_format);
  }

 private:
  Conv2DParameters params_;
  bool use_cudnn_;
  bool cudnn_use_autotune_;

  LaunchConv2DOp<Device, T> launcher_;

  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
};

#define REGISTER_CPU(T)                                         \
  REGISTER_KERNEL_BUILDER(                                      \
      Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      Conv2DOp<CPUDevice, T>);

// If we're using the alternative GEMM-based implementation of Conv2D for the
// CPU, don't register this EigenTensor-based version.
#if !defined(USE_GEMM_FOR_CONV)
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
#endif  // USE_GEMM_FOR_CONV

// To be used inside depthwise_conv_op.cc.
template struct LaunchConv2DOp<CPUDevice, Eigen::half>;
template struct LaunchConv2DOp<CPUDevice, float>;
template struct LaunchConv2DOp<CPUDevice, double>;

#if GOOGLE_CUDA
int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
                           int64 default_value_in_bytes) {
  const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str());
  if (workspace_limit_in_mb_str != nullptr &&
      strcmp(workspace_limit_in_mb_str, "") != 0) {
    int64 scratch_limit_in_mb = -1;
    if (strings::safe_strto64(workspace_limit_in_mb_str,
                              &scratch_limit_in_mb)) {
      return scratch_limit_in_mb * (1 << 20);
    } else {
      LOG(WARNING) << "Invalid value for env-var " << envvar_in_mb << ": "
                   << workspace_limit_in_mb_str;
    }
  }
  return default_value_in_bytes;
}
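// For example, setting TF_CUDNN_WORKSPACE_LIMIT_IN_MB=1024 caps cuDNN scratch
// allocations at 1 GiB (the parsed value is multiplied by 2^20 bytes above).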

// A dummy type to group forward convolution autotune results together.
struct ConvAutoTuneGroup {
  static string name() { return "Conv"; }
};
typedef AutoTuneSingleton<ConvAutoTuneGroup, ConvParameters,
                          se::dnn::AlgorithmConfig>
    AutoTuneConv;

template <typename T>
void LaunchConv2DOp<GPUDevice, T>::operator()(
    OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
    const Tensor& input_param, const Tensor& filter, int row_dilation,
    int col_dilation, int row_stride, int col_stride, const Padding& padding,
    const std::vector<int64>& explicit_paddings, Tensor* output,
    TensorFormat data_format) {
  using se::dnn::AlgorithmConfig;
  using se::dnn::AlgorithmDesc;
  using se::dnn::ProfileResult;
  auto* stream = ctx->op_device_context()->stream();
  OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));

  if (!use_cudnn) {
    ctx->SetStatus(
        errors::Unimplemented("Conv2D for GPU is not currently supported "
                              "without cudnn"));
    return;
  }

  Tensor input = input_param;
  const int64 in_batch = GetTensorDim(input, data_format, 'N');
  int64 in_rows = GetTensorDim(input, data_format, 'H');
  int64 in_cols = GetTensorDim(input, data_format, 'W');
  const int64 in_depths = GetTensorDim(input, data_format, 'C');
  const int64 patch_rows = filter.dim_size(0);
  const int64 patch_cols = filter.dim_size(1);
  const int64 patch_depths = filter.dim_size(2);

  // If the filter in-depth (patch_depths) is 1 and smaller than the input
  // depth, it's a depthwise convolution. More generally, if the filter in-depth
  // divides but is smaller than the input depth, it is a grouped convolution.
  bool is_grouped_convolution = patch_depths != in_depths;
  if (patch_rows == 1 && patch_cols == 1 && !is_grouped_convolution &&
      row_dilation == 1 && col_dilation == 1 && row_stride == 1 &&
      col_stride == 1 && data_format == FORMAT_NHWC &&
      (padding == VALID || padding == SAME)) {
    // 1x1 filter, so call cublas directly.
    const uint64 m = in_batch * in_rows * in_cols;
    const uint64 k = patch_depths;
    const uint64 n = filter.dim_size(3);

    auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                input.template flat<T>().size());
    auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                filter.template flat<T>().size());
    auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                output->template flat<T>().size());

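    // Note on the argument order below: StreamExecutor's BLAS interface is
    // column-major, so the row-major product output[m, n] = input[m, k] *
    // filter[k, n] is issued as the equivalent transposed computation, which
    // is why the call passes (n, m, k) with the filter as the left operand.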
    auto no_transpose = se::blas::Transpose::kNoTranspose;
    bool blas_launch_status =
        stream
            ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
                           a_ptr, k, 0.0f, &c_ptr, n)
            .ok();
    if (!blas_launch_status) {
      ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
                                      ", n=", n, ", k=", k));
    }
    return;
  } else if (patch_rows == in_rows && patch_cols == in_cols &&
             !is_grouped_convolution && row_dilation == 1 &&
             col_dilation == 1 && padding == VALID &&
             data_format == FORMAT_NHWC) {
    // The input data and filter have the same height/width, so call cublas
    // directly.
    const uint64 m = in_batch;
    const uint64 k = patch_rows * patch_cols * patch_depths;
    const uint64 n = filter.dim_size(3);

    auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                input.template flat<T>().size());
    auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                filter.template flat<T>().size());
    auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                output->template flat<T>().size());

    auto no_transpose = se::blas::Transpose::kNoTranspose;
    bool blas_launch_status =
        stream
            ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
                           a_ptr, k, 0.0f, &c_ptr, n)
            .ok();
    if (!blas_launch_status) {
      ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
                                      ", n=", n, ", k=", k));
    }
    return;
  }

  const int64 out_batch = GetTensorDim(*output, data_format, 'N');
  const int64 out_rows = GetTensorDim(*output, data_format, 'H');
  const int64 out_cols = GetTensorDim(*output, data_format, 'W');
  const int64 out_depths = GetTensorDim(*output, data_format, 'C');
  int64 padding_top = -1, padding_bottom = -1;
  int64 padding_left = -1, padding_right = -1;
  if (padding == EXPLICIT) {
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
                             &padding_bottom);
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
                             &padding_right);
  }
  int64 out_rows_check, out_cols_check;
  Status status = GetWindowedOutputSizeVerboseV2(
      in_rows, patch_rows, row_dilation, row_stride, padding, &out_rows_check,
      &padding_top, &padding_bottom);
  // The status is guaranteed to be OK because we checked earlier that the
  // output and padding were valid.
  TF_CHECK_OK(status);
  DCHECK_EQ(out_rows, out_rows_check);
  status = GetWindowedOutputSizeVerboseV2(in_cols, patch_cols, col_dilation,
                                          col_stride, padding, &out_cols_check,
                                          &padding_left, &padding_right);
  TF_CHECK_OK(status);
  DCHECK_EQ(out_cols, out_cols_check);

  const int64 common_padding_rows = std::min(padding_top, padding_bottom);
  const int64 common_padding_cols = std::min(padding_left, padding_right);
  if (padding_top != padding_bottom || padding_left != padding_right) {
    // cuDNN only supports padding the same amount on the left and right sides,
    // and on the top and bottom sides. So we manually create a new padded
    // input tensor such that we can pass it to cuDNN.

    // TODO(reedwm): In some cases, we can avoid an allocation even if the two
    // padding sides are different. For example, if the input is 2x2, the filter
    // is 1x1, the stride is 2, and the padding is (1, 0, 1, 0), the result is
    // the same as if the padding were (1, 1, 1, 1). Changing the padding in
    // such a way would allow us to avoid the allocation.
    Tensor transformed_input;
    const int64 padding_rows_diff = std::abs(padding_bottom - padding_top);
    const int64 padding_cols_diff = std::abs(padding_right - padding_left);
    const int64 new_in_rows = in_rows + padding_rows_diff;
    const int64 new_in_cols = in_cols + padding_cols_diff;
    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                            DataTypeToEnum<T>::value,
                            ShapeFromFormat(data_format, in_batch, new_in_rows,
                                            new_in_cols, in_depths),
                            &transformed_input));

    const int64 input_pad_top = padding_top - common_padding_rows;
    const int64 input_pad_bottom = padding_bottom - common_padding_rows;
    const int64 input_pad_left = padding_left - common_padding_cols;
    const int64 input_pad_right = padding_right - common_padding_cols;
    bool in_bounds =
        FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
    if (!in_bounds) {
      ctx->SetStatus(errors::InvalidArgument("Padding is too large."));
      return;
    }
    functor::PadInput<GPUDevice, T, int, 4>()(
        ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
        {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
        {{static_cast<int>(input_pad_bottom),
          static_cast<int>(input_pad_right)}},
        To32Bit(transformed_input.tensor<T, 4>()), data_format);

    input = transformed_input;
    in_rows = new_in_rows;
    in_cols = new_in_cols;
  }

  if (data_format == FORMAT_NHWC) {
    // Convert the input tensor from NHWC to NCHW.
    TensorShape nchw_shape =
        ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
    if (in_depths > 1) {
      Tensor transformed_input;
      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                             nchw_shape, &transformed_input));
      functor::NHWCToNCHW<GPUDevice, T, 4>()(
          ctx->eigen_device<GPUDevice>(),
          const_cast<const Tensor&>(input).tensor<T, 4>(),
          transformed_input.tensor<T, 4>());
      input = transformed_input;
    } else {
      // If depth <= 1, then just reshape.
      CHECK(input.CopyFrom(input, nchw_shape));
    }
  }

  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
      << "Negative row or col paddings: (" << common_padding_rows << ", "
      << common_padding_cols << ")";
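  // From here on everything is described to cuDNN in NCHW (kBatchDepthYX)
  // layout: the input was transformed above if it arrived as NHWC, and the
  // output is transformed back at the end of this function.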
  se::dnn::BatchDescriptor input_desc;
  input_desc.set_count(in_batch)
      .set_feature_map_count(in_depths)
      .set_height(in_rows)
      .set_width(in_cols)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
  se::dnn::BatchDescriptor output_desc;
  output_desc.set_count(out_batch)
      .set_height(out_rows)
      .set_width(out_cols)
      .set_feature_map_count(out_depths)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
  se::dnn::FilterDescriptor filter_desc;
  filter_desc.set_input_filter_height(patch_rows)
      .set_input_filter_width(patch_cols)
      .set_input_feature_map_count(patch_depths)
      .set_output_feature_map_count(filter.dim_size(3));
  se::dnn::ConvolutionDescriptor conv_desc;
  conv_desc.set_vertical_dilation_rate(row_dilation)
      .set_horizontal_dilation_rate(col_dilation)
      .set_vertical_filter_stride(row_stride)
      .set_horizontal_filter_stride(col_stride)
      .set_zero_padding_height(common_padding_rows)
      .set_zero_padding_width(common_padding_cols)
      .set_group_count(in_depths / patch_depths);

  Tensor transformed_filter;
  OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                          DataTypeToEnum<T>::value,
                          TensorShape({filter.dim_size(3), filter.dim_size(2),
                                       filter.dim_size(0), filter.dim_size(1)}),
                          &transformed_filter));
  functor::TransformFilter<GPUDevice, T, int, 4>()(
      ctx->eigen_device<GPUDevice>(), FORMAT_OIHW,
      To32Bit(filter.tensor<T, 4>()),
      To32Bit(transformed_filter.tensor<T, 4>()));

  Tensor transformed_output;
  if (data_format == FORMAT_NHWC) {
    // Only allocate temporary memory when a layout transformation is needed.
    OP_REQUIRES_OK(
        ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, out_batch,
                                                out_rows, out_cols, out_depths),
                                &transformed_output));
  } else {
    transformed_output = *output;
  }

  auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                  input.template flat<T>().size());
  auto filter_ptr =
      AsDeviceMemory(transformed_filter.template flat<T>().data(),
                     transformed_filter.template flat<T>().size());
  auto output_ptr =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());

  static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );
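  // Note: the static local above means the environment variable is read only
  // once per process; later changes to it do not affect this kernel.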

  int device_id = stream->parent()->device_ordinal();
  DataType dtype = input.dtype();
  ConvParameters conv_parameters = {
      in_batch,               // batch
      in_depths,              // in_depths
      {{in_rows,              // in_rows
        in_cols}},            // in_cols
      FORMAT_NCHW,            // compute_data_format
      out_depths,             // out_depths
      {{patch_rows,           // filter_rows
        patch_cols,           // filter_cols
        patch_depths}},       // filter_depths
      {{row_dilation,         // dilation_rows
        col_dilation}},       // dilation_cols
      {{row_stride,           // stride_rows
        col_stride}},         // stride_cols
      {{common_padding_rows,  // padding_rows
        common_padding_cols}},  // padding_cols
      dtype,                  // tensor datatype
      device_id,              // device_id
  };
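  // Autotuning: the best algorithm is cached per ConvParameters key (shapes,
  // strides, padding, dtype, device), so the profiling loop below runs only
  // the first time a given configuration is seen.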
  AlgorithmConfig algorithm_config;
  if (cudnn_use_autotune &&
      !AutoTuneConv::GetInstance()->Find(conv_parameters, &algorithm_config)) {
    std::vector<AlgorithmDesc> algorithms;
    OP_REQUIRES(
        ctx,
        stream->parent()->GetConvolveAlgorithms(
            conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
                stream->parent()),
            &algorithms),
        errors::Unknown("Failed to get convolution algorithm. This is probably "
                        "because cuDNN failed to initialize, so try looking to "
                        "see if a warning log message was printed above."));
    std::vector<tensorflow::AutotuneResult> results;
    for (auto profile_algorithm : algorithms) {
      // TODO(zhengxq): profile each algorithm multiple times for better
      // accuracy.
      DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
      ProfileResult profile_result;
      bool cudnn_launch_status =
          stream
              ->ThenConvolveWithAlgorithm(
                  input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
                  output_desc, &output_ptr, &scratch_allocator,
                  AlgorithmConfig(profile_algorithm), &profile_result)
              .ok();
      if (cudnn_launch_status) {
        if (profile_result.is_valid()) {
          results.emplace_back();
          auto& result = results.back();
          result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
          result.mutable_conv()->set_tensor_ops_enabled(
              profile_algorithm.tensor_ops_enabled());
          result.mutable_success()->set_scratch_bytes(
              scratch_allocator.TotalByteSize());
          *result.mutable_success()->mutable_run_time() =
              proto_utils::ToDurationProto(
                  absl::Milliseconds(profile_result.elapsed_time_in_ms()));
        }
      }
    }
    LogConvAutotuneResults(ctx->op_kernel().def(), input, transformed_filter,
                           transformed_output, stream->parent(), results);
    OP_REQUIRES_OK(ctx, BestCudnnConvAlgorithm(results, &algorithm_config));
    AutoTuneConv::GetInstance()->Insert(conv_parameters, algorithm_config);
  }

  DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
  bool cudnn_launch_status =
      stream
          ->ThenConvolveWithAlgorithm(input_desc, input_ptr, filter_desc,
                                      filter_ptr, conv_desc, output_desc,
                                      &output_ptr, &scratch_allocator,
                                      algorithm_config, nullptr)
          .ok();

  if (!cudnn_launch_status) {
    ctx->SetStatus(errors::Internal(
        "cuDNN launch failure : input shape(", input.shape().DebugString(),
        ") filter shape(", filter.shape().DebugString(), ")"));
  }

  // Convert the output tensor back from NCHW to NHWC.
  if (data_format == FORMAT_NHWC) {
    functor::NCHWToNHWC<GPUDevice, T, 4>()(
        ctx->eigen_device<GPUDevice>(),
        const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
        output->tensor<T, 4>());
  }
}

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                                 \
  template <>                                                               \
  void SpatialConvolution<GPUDevice, T>::operator()(                        \
      const GPUDevice& d, typename TTypes<T, 4>::Tensor output,             \
      typename TTypes<T, 4>::ConstTensor input,                             \
      typename TTypes<T, 4>::ConstTensor filter, int row_stride,            \
      int col_stride, int row_dilation, int col_dilation,                   \
      const Eigen::PaddingType& padding,                                    \
      const Eigen::NoOpOutputKernel& output_kernel);                        \
  extern template struct SpatialConvolution<GPUDevice, T>;                  \
  template <>                                                               \
  void MatMulConvFunctor<GPUDevice, T>::operator()(                         \
      const GPUDevice& d, typename TTypes<T, 2>::Tensor out,                \
      typename TTypes<T, 2>::ConstTensor in0,                               \
      typename TTypes<T, 2>::ConstTensor in1,                               \
      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair, \
      const Eigen::NoOpOutputKernel& output_kernel);                        \
  extern template struct MatMulConvFunctor<GPUDevice, T>;                   \
  template <>                                                               \
  void TransformFilter<GPUDevice, T, int, 4>::operator()(                   \
      const GPUDevice& d, FilterTensorFormat dst_filter_format,             \
      typename TTypes<T, 4, int>::ConstTensor in,                           \
      typename TTypes<T, 4, int>::Tensor out);                              \
  extern template struct TransformFilter<GPUDevice, T, int, 4>;             \
  template <>                                                               \
  void PadInput<GPUDevice, T, int, 4>::operator()(                          \
      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,       \
      const std::array<int, 2>& padding_left,                               \
      const std::array<int, 2>& padding_right,                              \
      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format);    \
  extern template struct PadInput<GPUDevice, T, int, 4>

DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(double);
#undef DECLARE_GPU_SPEC
}  // namespace functor

// Registration of the GPU implementations.
REGISTER_KERNEL_BUILDER(
    Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
    Conv2DOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
    Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    Conv2DOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<double>("T"),
    Conv2DOp<GPUDevice, double>);

// To be used inside depthwise_conv_op.cc.
template struct LaunchConv2DOp<GPUDevice, float>;
template struct LaunchConv2DOp<GPUDevice, Eigen::half>;
template struct LaunchConv2DOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA

}  // namespace tensorflow