/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#include "tensorflow/core/kernels/conv_grad_input_ops.h"

#include "tensorflow/core/profiler/lib/scoped_annotation.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// To be used inside depthwise_conv_grad_op.cc.
template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>;
template struct LaunchConv2DBackpropInputOp<CPUDevice, float>;
template struct LaunchConv2DBackpropInputOp<CPUDevice, double>;

// GPU definitions.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// The slow version (but compiles for GPU)

// A dummy type used to group backward-data convolution autotune results
// together.
struct ConvBackwardDataAutotuneGroup {
  static string name() { return "ConvBwdData"; }
};

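// AutotuneConvBwdData caches the algorithm selected for each distinct
// ConvParameters key, so the autotuning below runs at most once per unique
// convolution configuration in a process.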
typedef AutotuneSingleton<ConvBackwardDataAutotuneGroup, ConvParameters,
                          se::dnn::AlgorithmConfig>
    AutotuneConvBwdData;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Computes backprop input using Eigen::SpatialConvolutionBackwardInput on GPU
// for int32 inputs.
template <>
struct LaunchConv2DBackpropInputOp<GPUDevice, int32> {
  void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                  const Tensor& out_backprop, const Tensor& filter,
                  int row_dilation, int col_dilation, int row_stride,
                  int col_stride, const Padding& padding,
                  const std::vector<int64>& explicit_paddings,
                  Tensor* in_backprop, TensorFormat data_format) {
    LaunchConv2DBackpropInputOpImpl<GPUDevice, int32> launcher;
    launcher(ctx, use_cudnn, cudnn_use_autotune, out_backprop, filter,
             row_dilation, col_dilation, row_stride, col_stride, padding,
             explicit_paddings, in_backprop, data_format);
  }
};
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename T>
void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
    OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
    const Tensor& out_backprop, const Tensor& filter, int row_dilation,
    int col_dilation, int row_stride, int col_stride, const Padding& padding,
    const std::vector<int64>& explicit_paddings, Tensor* in_backprop,
    TensorFormat data_format) {
  using se::dnn::AlgorithmConfig;
  using se::dnn::AlgorithmDesc;
  using se::dnn::ProfileResult;

  std::vector<int32> strides(4, 1);
  std::vector<int32> dilations(4, 1);
  auto input_h = GetTensorDimIndex(data_format, 'H');
  auto input_w = GetTensorDimIndex(data_format, 'W');
  strides[input_h] = row_stride;
  strides[input_w] = col_stride;
  dilations[input_h] = row_dilation;
  dilations[input_w] = col_dilation;
  TensorShape input_shape = in_backprop->shape();

  const TensorShape& filter_shape = filter.shape();
  ConvBackpropDimensions dims;
  OP_REQUIRES_OK(
      ctx, ConvBackpropComputeDimensionsV2(
               "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2, input_shape,
               filter_shape, out_backprop.shape(), dilations, strides, padding,
               explicit_paddings, data_format, &dims));

  int64_t padding_top = -1, padding_bottom = -1;
  int64_t padding_left = -1, padding_right = -1;
  if (padding == EXPLICIT) {
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
                             &padding_bottom);
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
                             &padding_right);
  }
  int64_t expected_out_rows, expected_out_cols;
  // This call is guaranteed to succeed because the output shape and padding
  // were already validated above.
  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
      dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
      row_dilation, row_stride, padding, &expected_out_rows, &padding_top,
      &padding_bottom));
  DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows);
  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
      dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
      col_dilation, col_stride, padding, &expected_out_cols, &padding_left,
      &padding_right));
  DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols);

  auto* stream = ctx->op_device_context()->stream();
  OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));

  if (!use_cudnn) {
    ctx->SetStatus(errors::Unimplemented(
        "Conv2DBackpropInput for GPU is not currently supported "
        "without cudnn"));
    return;
  }

  // If the filter in-depth (filter_shape.dim_size(2)) is 1 and smaller than the
  // input depth, it's a depthwise convolution. More generally, if the filter
  // in-depth divides but is smaller than the input depth, it is a grouped
  // convolution.
  bool is_grouped_convolution = filter_shape.dim_size(2) != dims.in_depth;
  if (dims.spatial_dims[0].filter_size == 1 &&
      dims.spatial_dims[1].filter_size == 1 && !is_grouped_convolution &&
      dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 &&
      data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) {
    // 1x1 filter, so call cublas directly.
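    // Viewed as matrices, out_backprop is [m, k] (rows = N*H*W, cols =
    // out_depth) and the 1x1 HWIO filter is [n, k] (in_depth x out_depth), so
    // in_backprop = out_backprop * filter^T is a single GEMM.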
    const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size *
                     dims.spatial_dims[1].input_size;
    const uint64 k = dims.out_depth;
    const uint64 n = dims.in_depth;

    auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
                                out_backprop.template flat<T>().size());
    auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                filter.template flat<T>().size());
    auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                in_backprop->template flat<T>().size());

    auto transpose = se::blas::Transpose::kTranspose;
    auto no_transpose = se::blas::Transpose::kNoTranspose;

    OP_REQUIRES_OK(ctx, stream->ThenBlasGemm(transpose, no_transpose, n, m, k,
                                             b_ptr, k, a_ptr, k, &c_ptr, n));
    return;
  } else if (dims.spatial_dims[0].filter_size ==
                 dims.spatial_dims[0].input_size &&
             dims.spatial_dims[1].filter_size ==
                 dims.spatial_dims[1].input_size &&
             !is_grouped_convolution && padding == VALID &&
             data_format == FORMAT_NHWC) {
    // The input data and filter have the same height/width, and we are not
    // using grouped convolution, so call cublas directly.
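    // With the filter covering the whole input and VALID padding, each image
    // produces a single output position: out_backprop is [m, k] (batch x
    // out_depth) and the HWIO filter is [n, k] (H*W*in_depth x out_depth), so
    // again in_backprop = out_backprop * filter^T.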
    const uint64 m = dims.batch_size;
    const uint64 k = dims.out_depth;
    const uint64 n = dims.spatial_dims[0].input_size *
                     dims.spatial_dims[1].input_size * dims.in_depth;

    auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
                                out_backprop.template flat<T>().size());
    auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                filter.template flat<T>().size());
    auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                in_backprop->template flat<T>().size());

    auto transpose = se::blas::Transpose::kTranspose;
    auto no_transpose = se::blas::Transpose::kNoTranspose;

    OP_REQUIRES_OK(ctx, stream->ThenBlasGemm(transpose, no_transpose, n, m, k,
                                             b_ptr, k, a_ptr, k, &c_ptr, n));
    return;
  }

  const int64_t common_padding_rows = std::min(padding_top, padding_bottom);
  const int64_t common_padding_cols = std::min(padding_left, padding_right);
  TensorShape compatible_input_shape;
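  // cuDNN and MIOpen accept a single (symmetric) padding value per spatial
  // dimension. With asymmetric explicit padding (e.g. top=1, bottom=2) we use
  // the common (minimum) padding and enlarge the input shape by the
  // difference; the extra gradient rows/cols are stripped after the
  // convolution below.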
  if (padding_top != padding_bottom || padding_left != padding_right) {
    // Pad the input in the same way we did during the forward pass, so that
    // cuDNN or MIOpen receives the same input during the backward pass as it
    // did during the forward pass.
    const int64_t padding_rows_diff = std::abs(padding_bottom - padding_top);
    const int64_t padding_cols_diff = std::abs(padding_right - padding_left);
    const int64_t new_in_rows =
        dims.spatial_dims[0].input_size + padding_rows_diff;
    const int64_t new_in_cols =
        dims.spatial_dims[1].input_size + padding_cols_diff;
    compatible_input_shape = ShapeFromFormat(
        data_format, dims.batch_size, new_in_rows, new_in_cols, dims.in_depth);
  } else {
    compatible_input_shape = input_shape;
  }

  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
      << "Negative row or col paddings: (" << common_padding_rows << ", "
      << common_padding_cols << ")";

  // Tensor Cores in NVIDIA Volta+ GPUs support efficient convolution with
  // fp16 in the NHWC data layout. In all other configurations it is more
  // efficient to run the computation in the NCHW data format.
  const bool compute_in_nhwc = DataTypeToEnum<T>::value == DT_HALF &&
                               stream->GetCudaComputeCapability().IsAtLeast(
                                   se::CudaComputeCapability::VOLTA);

  // We only do one directional conversion: NHWC->NCHW. We never convert in the
  // other direction. Grappler layout optimizer selects the preferred layout and
  // adds necessary annotations to the graph.
  const TensorFormat compute_data_format =
      (compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC
                                                      : FORMAT_NCHW;

  VLOG(3) << "Compute Conv2DBackpropInput with cuDNN:"
          << " data_format=" << ToString(data_format)
          << " compute_data_format=" << ToString(compute_data_format);

  constexpr auto kComputeInNHWC =
      std::make_tuple(se::dnn::DataLayout::kBatchYXDepth,
                      se::dnn::FilterLayout::kOutputYXInput);
  constexpr auto kComputeInNCHW =
      std::make_tuple(se::dnn::DataLayout::kBatchDepthYX,
                      se::dnn::FilterLayout::kOutputInputYX);

  se::dnn::DataLayout compute_data_layout;
  se::dnn::FilterLayout filter_layout;

  std::tie(compute_data_layout, filter_layout) =
      compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW;

  se::dnn::BatchDescriptor input_desc;
  input_desc.set_count(dims.batch_size)
      .set_height(GetTensorDim(compatible_input_shape, data_format, 'H'))
      .set_width(GetTensorDim(compatible_input_shape, data_format, 'W'))
      .set_feature_map_count(dims.in_depth)
      .set_layout(compute_data_layout);
  se::dnn::BatchDescriptor output_desc;
  output_desc.set_count(dims.batch_size)
      .set_height(dims.spatial_dims[0].output_size)
      .set_width(dims.spatial_dims[1].output_size)
      .set_feature_map_count(dims.out_depth)
      .set_layout(compute_data_layout);
  se::dnn::FilterDescriptor filter_desc;
  filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size)
      .set_input_filter_width(dims.spatial_dims[1].filter_size)
      .set_input_feature_map_count(filter_shape.dim_size(2))
      .set_output_feature_map_count(filter_shape.dim_size(3))
      .set_layout(filter_layout);
  se::dnn::ConvolutionDescriptor conv_desc;
  conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
      .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
      .set_vertical_filter_stride(dims.spatial_dims[0].stride)
      .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
      .set_zero_padding_height(common_padding_rows)
      .set_zero_padding_width(common_padding_cols)
      .set_group_count(dims.in_depth / filter_shape.dim_size(2));

  // Tensorflow filter format: HWIO
  // cuDNN filter formats: (data format) -> (filter format)
  //   (1) NCHW -> OIHW
  //   (2) NHWC -> OHWI

  Tensor transformed_filter;
  const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status {
    VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO)
            << " to " << ToString(dst_format);

    TensorShape dst_shape =
        dst_format == FORMAT_OIHW
            ? TensorShape({filter.dim_size(3), filter.dim_size(2),
                           filter.dim_size(0), filter.dim_size(1)})
            : TensorShape({filter.dim_size(3), filter.dim_size(0),
                           filter.dim_size(1), filter.dim_size(2)});

    TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, dst_shape,
                                          &transformed_filter));
    functor::TransformFilter<GPUDevice, T, int, 4>()(
        ctx->eigen_device<GPUDevice>(), dst_format,
        To32Bit(filter.tensor<T, 4>()),
        To32Bit(transformed_filter.tensor<T, 4>()));

    return Status::OK();
  };

  if (compute_data_format == FORMAT_NCHW) {
    OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OIHW));
  } else if (compute_data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OHWI));
  } else {
    ctx->SetStatus(errors::InvalidArgument("Invalid compute data format: ",
                                           ToString(compute_data_format)));
    return;
  }

  Tensor transformed_out_backprop;
  if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
    VLOG(4) << "Convert the `out_backprop` tensor from NHWC to NCHW.";
    TensorShape compute_shape = ShapeFromFormat(
        compute_data_format, dims.batch_size, dims.spatial_dims[0].output_size,
        dims.spatial_dims[1].output_size, dims.out_depth);
    if (dims.out_depth > 1) {
      OP_REQUIRES_OK(ctx,
                     ctx->allocate_temp(DataTypeToEnum<T>::value, compute_shape,
                                        &transformed_out_backprop));
      functor::NHWCToNCHW<GPUDevice, T, 4>()(
          ctx->eigen_device<GPUDevice>(), out_backprop.tensor<T, 4>(),
          transformed_out_backprop.tensor<T, 4>());
    } else {
      // If depth <= 1, then just reshape.
      CHECK(transformed_out_backprop.CopyFrom(out_backprop, compute_shape));
    }
  } else {
    transformed_out_backprop = out_backprop;
  }

  Tensor pre_transformed_in_backprop;
  OP_REQUIRES_OK(
      ctx, ctx->allocate_temp(
               DataTypeToEnum<T>::value,
               ShapeFromFormat(
                   compute_data_format,
                   GetTensorDim(compatible_input_shape, data_format, 'N'),
                   GetTensorDim(compatible_input_shape, data_format, 'H'),
                   GetTensorDim(compatible_input_shape, data_format, 'W'),
                   GetTensorDim(compatible_input_shape, data_format, 'C')),
               &pre_transformed_in_backprop));

  auto out_backprop_ptr =
      AsDeviceMemory(transformed_out_backprop.template flat<T>().data(),
                     transformed_out_backprop.template flat<T>().size());
  auto filter_ptr =
      AsDeviceMemory(transformed_filter.template flat<T>().data(),
                     transformed_filter.template flat<T>().size());
  auto in_backprop_ptr =
      AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(),
                     pre_transformed_in_backprop.template flat<T>().size());

  int64_t workspace_bytes = 1LL << 32;  // 4GB by default.
  // The cuDNN frontend API exposes more engines, some of which may require a
  // very large workspace; that would increase overall memory demand when
  // training models, so use a smaller default limit in that case.
  if (CudnnUseFrontend()) {
    workspace_bytes = 1LL << 30;  // 1GB by default.
  }
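  // GetDnnWorkspaceLimit lets the TF_CUDNN_WORKSPACE_LIMIT_IN_MB environment
  // variable (in megabytes) override the default chosen above.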
  static int64_t ConvolveBackwardDataScratchSize =
      GetDnnWorkspaceLimit("TF_CUDNN_WORKSPACE_LIMIT_IN_MB", workspace_bytes);
  DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx);
  int device_id = stream->parent()->device_ordinal();
  DataType dtype = out_backprop.dtype();
  ConvParameters conv_parameters = {
      dims.batch_size,                     // batch
      dims.in_depth,                       // in_depths
      {{input_desc.height(),               // in_rows
        input_desc.width()}},              // in_cols
      compute_data_format,                 // compute_data_format
      dims.out_depth,                      // out_depths
      {{dims.spatial_dims[0].filter_size,  // filter_rows
        dims.spatial_dims[1].filter_size,  // filter_cols
        filter_shape.dim_size(2)}},        // filter_depths
      {{dims.spatial_dims[0].dilation,     // dilation_rows
        dims.spatial_dims[1].dilation}},   // dilation_cols
      {{dims.spatial_dims[0].stride,       // stride_rows
        dims.spatial_dims[1].stride}},     // stride_cols
      {{common_padding_rows,               // padding_rows
        common_padding_cols}},             // padding_cols
      dtype,                               // tensor data type
      device_id,                           // device_id
      conv_desc.group_count()              // group_count
  };
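  // conv_parameters is the key used below to look up (and later store) the
  // autotuned algorithm in the AutotuneConvBwdData cache.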
#if TENSORFLOW_USE_ROCM
  // cudnn_use_autotune is applicable only to the CUDA flow; for ROCm/MIOpen we
  // need to call GetMIOpenConvolveAlgorithms explicitly if we do not have a
  // cached algorithm_config for this conv_parameters.
  cudnn_use_autotune = true;
#endif
  AlgorithmConfig algorithm_config;

  if (cudnn_use_autotune && !AutotuneConvBwdData::GetInstance()->Find(
                                conv_parameters, &algorithm_config)) {
    profiler::ScopedAnnotation trace("cudnn_autotuning");

    std::vector<std::unique_ptr<se::dnn::ConvolveExecutionPlan>> plans;
#if GOOGLE_CUDA
    std::vector<AlgorithmDesc> algorithms;
    std::vector<AlgorithmConfig> configs;
    if (CudnnUseFrontend()) {
      OP_REQUIRES(
          ctx,
          stream->parent()->GetConvolveExecutionPlans(
              se::dnn::ConvolutionKind::BACKWARD_DATA,
              se::dnn::ToDataType<T>::value, stream, input_desc, filter_desc,
              output_desc, conv_desc, &plans),
          errors::Unknown("Failed to get convolution execution plan. This is "
                          "probably because cuDNN failed to initialize, so try "
                          "looking to see if a warning log message was printed "
                          "above."));
      for (const auto& plan : plans) {
        configs.push_back(
            AlgorithmConfig(AlgorithmDesc{plan->getTag(), plan->get_raw_desc()},
                            plan->getWorkspaceSize()));
      }
    } else {
      OP_REQUIRES(
          ctx, stream->parent()->GetConvolveBackwardDataAlgorithms(&algorithms),
          errors::Unknown("Failed to get convolution execution plan. This is "
                          "probably because cuDNN failed to initialize, so try "
                          "looking to see if a warning log message was printed "
                          "above."));
      for (const auto& algorithm : algorithms) {
        configs.push_back(AlgorithmConfig(algorithm));
      }
    }

    se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                                stream);

    se::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter,
                                      se::GpuAsmOpts());

    se::DeviceMemory<T> in_backprop_ptr_rz(
        WrapRedzoneBestEffort(&rz_allocator, in_backprop_ptr));

    std::vector<tensorflow::AutotuneResult> results;
    for (auto& profile_config : configs) {
      // TODO(zhengxq): profile each algorithm multiple times for better
      // accuracy.
      DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
                                            ctx);
      se::RedzoneAllocator rz_scratch_allocator(
          stream, &tf_allocator_adapter, se::GpuAsmOpts(),
          /*memory_limit=*/ConvolveBackwardDataScratchSize);
      se::ScratchAllocator* allocator_used =
          !RedzoneCheckDisabled()
              ? static_cast<se::ScratchAllocator*>(&rz_scratch_allocator)
              : static_cast<se::ScratchAllocator*>(&scratch_allocator);
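      // When redzone checks are enabled, scratch and output buffers are padded
      // with guard bytes so that out-of-bounds writes by a candidate algorithm
      // can be detected by CheckRedzones below.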
      ProfileResult profile_result;
      Status cudnn_launch_status;
      if (CudnnUseFrontend()) {
        cudnn_launch_status = stream->ConvolveBackwardDataWithExecutionPlan(
            filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
            input_desc, &in_backprop_ptr_rz, allocator_used, profile_config,
            &profile_result);
      } else {
        cudnn_launch_status = stream->ConvolveBackwardDataWithAlgorithm(
            filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
            input_desc, &in_backprop_ptr_rz, allocator_used, profile_config,
            &profile_result);
      }

      if (cudnn_launch_status.ok() && profile_result.is_valid()) {
        results.emplace_back();
        auto& result = results.back();
        if (CudnnUseFrontend()) {
          result.mutable_cuda_conv_plan()->set_exec_plan_id(
              profile_config.algorithm()->exec_plan_id());
        } else {
          result.mutable_conv()->set_algorithm(
              profile_config.algorithm()->algo_id());
          result.mutable_conv()->set_tensor_ops_enabled(
              profile_config.algorithm()->tensor_ops_enabled());
        }

        result.set_scratch_bytes(
            !RedzoneCheckDisabled()
                ? rz_scratch_allocator.TotalAllocatedBytesExcludingRedzones()
                : scratch_allocator.TotalByteSize());
        *result.mutable_run_time() = proto_utils::ToDurationProto(
            absl::Milliseconds(profile_result.elapsed_time_in_ms()));

        CheckRedzones(rz_scratch_allocator, &result);
        CheckRedzones(rz_allocator, &result);
      } else if (CudnnUseFrontend()) {
        // When the cuDNN frontend APIs are used, the profiling results must
        // map one-to-one onto the `plans`, so insert a dummy failure result
        // when the execution fails.
        results.emplace_back();
        auto& result = results.back();
        result.mutable_failure()->set_kind(AutotuneResult::UNKNOWN);
        result.mutable_failure()->set_msg(
            absl::StrCat("Profiling failure on CUDNN engine: ",
                         profile_config.algorithm()->exec_plan_id()));
      }
    }
#elif TENSORFLOW_USE_ROCM
    DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx);
    std::vector<ProfileResult> algorithms;
    OP_REQUIRES(
        ctx,
        stream->parent()->GetMIOpenConvolveAlgorithms(
            se::dnn::ConvolutionKind::BACKWARD_DATA,
            se::dnn::ToDataType<T>::value, stream, input_desc, in_backprop_ptr,
            filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
            &scratch_allocator, &algorithms),
        errors::Unknown(
            "Failed to get convolution algorithm. This is probably "
            "because MIOpen failed to initialize, so try looking to "
            "see if a warning log message was printed above."));

    std::vector<tensorflow::AutotuneResult> results;
    if (algorithms.size() == 1) {
      auto profile_result = algorithms[0];
      results.emplace_back();
      auto& result = results.back();
      result.mutable_conv()->set_algorithm(
          profile_result.algorithm().algo_id());
      result.mutable_conv()->set_tensor_ops_enabled(
          profile_result.algorithm().tensor_ops_enabled());

      result.set_scratch_bytes(profile_result.scratch_size());
      *result.mutable_run_time() = proto_utils::ToDurationProto(
          absl::Milliseconds(profile_result.elapsed_time_in_ms()));
    } else {
      for (auto miopen_algorithm : algorithms) {
        auto profile_algorithm = miopen_algorithm.algorithm();
        ProfileResult profile_result;
        auto miopen_launch_status = stream->ConvolveBackwardDataWithAlgorithm(
            filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
            input_desc, &in_backprop_ptr, &scratch_allocator,
            AlgorithmConfig(profile_algorithm, miopen_algorithm.scratch_size()),
            &profile_result);

        if (miopen_launch_status.ok() && profile_result.is_valid()) {
          results.emplace_back();
          auto& result = results.back();
          result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
          result.mutable_conv()->set_tensor_ops_enabled(
              profile_algorithm.tensor_ops_enabled());
          result.set_scratch_bytes(scratch_allocator.TotalByteSize());
          *result.mutable_run_time() = proto_utils::ToDurationProto(
              absl::Milliseconds(profile_result.elapsed_time_in_ms()));
        }
      }
    }
#endif
    LogConvAutotuneResults(
        se::dnn::ConvolutionKind::BACKWARD_DATA, se::dnn::ToDataType<T>::value,
        in_backprop_ptr, filter_ptr, out_backprop_ptr, input_desc, filter_desc,
        output_desc, conv_desc, stream->parent(), results);
    if (CudnnUseFrontend()) {
      OP_REQUIRES_OK(
          ctx, BestCudnnConvAlgorithm(results, &plans, &algorithm_config));

    } else {
      OP_REQUIRES_OK(
          ctx, BestCudnnConvAlgorithm(results, nullptr, &algorithm_config));
    }
    AutotuneConvBwdData::GetInstance()->Insert(conv_parameters,
                                               algorithm_config);
  }
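  // At this point algorithm_config holds the cached or freshly autotuned
  // choice; when autotuning is disabled it stays empty (see the VLOG message
  // below) and the convolution is launched with that empty config.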

  Status cudnn_launch_status;
  if (CudnnUseFrontend()) {
    if (algorithm_config.algorithm().has_value()) {
      VLOG(4) << "Conv2DBackpropInput Execution Plan: "
              << algorithm_config.algorithm()->exec_plan_id();
    } else {
      VLOG(4) << "Convolution Autotune has been turned off";
    }
    cudnn_launch_status = stream->ConvolveBackwardDataWithExecutionPlan(
        filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
        input_desc, &in_backprop_ptr, &scratch_allocator, algorithm_config,
        nullptr);
  } else {
    cudnn_launch_status = stream->ConvolveBackwardDataWithAlgorithm(
        filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
        input_desc, &in_backprop_ptr, &scratch_allocator, algorithm_config,
        nullptr);
  }

  if (!cudnn_launch_status.ok()) {
    ctx->SetStatus(cudnn_launch_status);
    return;
  }

  if (padding_top != padding_bottom || padding_left != padding_right) {
    Tensor in_backprop_remove_padding;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_temp(
                 DataTypeToEnum<T>::value,
                 ShapeFromFormat(compute_data_format,
                                 GetTensorDim(input_shape, data_format, 'N'),
                                 GetTensorDim(input_shape, data_format, 'H'),
                                 GetTensorDim(input_shape, data_format, 'W'),
                                 GetTensorDim(input_shape, data_format, 'C')),
                 &in_backprop_remove_padding));

    // Remove the padding that was added to the input shape above.
    const int64_t input_pad_top = padding_top - common_padding_rows;
    const int64_t input_pad_bottom = padding_bottom - common_padding_rows;
    const int64_t input_pad_left = padding_left - common_padding_cols;
    const int64_t input_pad_right = padding_right - common_padding_cols;
    functor::PadInput<GPUDevice, T, int, 4>()(
        ctx->template eigen_device<GPUDevice>(),
        To32Bit(const_cast<const Tensor&>(pre_transformed_in_backprop)
                    .tensor<T, 4>()),
        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
        {{static_cast<int>(-input_pad_bottom),
          static_cast<int>(-input_pad_right)}},
        To32Bit(in_backprop_remove_padding.tensor<T, 4>()), compute_data_format,
        T{});

    pre_transformed_in_backprop = in_backprop_remove_padding;
  }

  if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
    VLOG(4) << "Convert the output tensor back from NCHW to NHWC.";
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 4>()(
        ctx->eigen_device<GPUDevice>(),
        toConstTensor(pre_transformed_in_backprop).template tensor<T, 4>(),
        in_backprop->tensor<T, 4>());
  } else {
    *in_backprop = pre_transformed_in_backprop;
  }
}

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                             \
  template <>                                                           \
  void TransformFilter<GPUDevice, T, int, 4>::operator()(               \
      const GPUDevice& d, FilterTensorFormat dst_filter_format,         \
      typename TTypes<T, 4, int>::ConstTensor in,                       \
      typename TTypes<T, 4, int>::Tensor out);                          \
  extern template struct TransformFilter<GPUDevice, T, int, 4>;         \
  template <>                                                           \
  void PadInput<GPUDevice, T, int, 4>::operator()(                      \
      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,   \
      const std::array<int, 2>& padding_left,                           \
      const std::array<int, 2>& padding_right,                          \
      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
      const T& padding_value);                                          \
  extern template struct PadInput<GPUDevice, T, int, 4>;

DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(double);
#undef DECLARE_GPU_SPEC

template <>
void SpatialConvolutionBackwardInputFunc<GPUDevice, int32>::operator()(
    const GPUDevice&, typename TTypes<int32, 4>::Tensor,
    typename TTypes<int32, 4>::ConstTensor,
    typename TTypes<int32, 4>::ConstTensor, Eigen::DenseIndex,
    Eigen::DenseIndex, Eigen::DenseIndex, Eigen::DenseIndex);
extern template struct SpatialConvolutionBackwardInputFunc<GPUDevice, int32>;

template <>
void SpatialConvolutionBackwardInputWithExplicitPaddingFunc<
    GPUDevice, int32>::operator()(const GPUDevice&,
                                  typename TTypes<int32, 4>::Tensor,
                                  typename TTypes<int32, 4>::ConstTensor,
                                  typename TTypes<int32, 4>::ConstTensor,
                                  Eigen::DenseIndex, Eigen::DenseIndex,
                                  Eigen::DenseIndex, Eigen::DenseIndex,
                                  Eigen::DenseIndex, Eigen::DenseIndex,
                                  Eigen::DenseIndex, Eigen::DenseIndex);
extern template struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc<
    GPUDevice, int32>;

}  // namespace functor

REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<double>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, double>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<float>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<Eigen::half>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, int32>);

// To be used inside depthwise_conv_grad_op.cc.
// TODO(reedwm): Move this and the definition to depthwise_conv_grad_op.cc.
template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow