/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#include "tensorflow/core/kernels/conv_grad_input_ops.h"

#include "tensorflow/core/profiler/lib/scoped_annotation.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// To be used inside depthwise_conv_grad_op.cc.
template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>;
template struct LaunchConv2DBackpropInputOp<CPUDevice, float>;
template struct LaunchConv2DBackpropInputOp<CPUDevice, double>;

// GPU definitions.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// The slow version (but compiles for GPU)

// A dummy type to group forward backward data autotune results together.
struct ConvBackwardDataAutotuneGroup {
  static string name() { return "ConvBwdData"; }
};

typedef AutotuneSingleton<ConvBackwardDataAutotuneGroup, ConvParameters,
                          se::dnn::AlgorithmConfig>
    AutotuneConvBwdData;
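
// AutotuneConvBwdData caches the best-performing cuDNN/MIOpen algorithm (or
// cuDNN frontend execution plan) per unique ConvParameters key, so the
// autotuning loop further below runs at most once for each distinct
// convolution configuration seen by this process.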

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Computes backprop input using Eigen::SpatialConvolutionBackwardInput on GPU
// for int32 inputs.
template <>
struct LaunchConv2DBackpropInputOp<GPUDevice, int32> {
  void operator()(OpKernelContext* ctx, bool use_cudnn,
                  bool cudnn_use_autotune, const Tensor& out_backprop,
                  const Tensor& filter, int row_dilation, int col_dilation,
                  int row_stride, int col_stride, const Padding& padding,
                  const std::vector<int64>& explicit_paddings,
                  Tensor* in_backprop, TensorFormat data_format) {
    LaunchConv2DBackpropInputOpImpl<GPUDevice, int32> launcher;
    launcher(ctx, use_cudnn, cudnn_use_autotune, out_backprop, filter,
             row_dilation, col_dilation, row_stride, col_stride, padding,
             explicit_paddings, in_backprop, data_format);
  }
};
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename T>
void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
    OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
    const Tensor& out_backprop, const Tensor& filter, int row_dilation,
    int col_dilation, int row_stride, int col_stride, const Padding& padding,
    const std::vector<int64>& explicit_paddings, Tensor* in_backprop,
    TensorFormat data_format) {
  using se::dnn::AlgorithmConfig;
  using se::dnn::AlgorithmDesc;
  using se::dnn::ProfileResult;

  std::vector<int32> strides(4, 1);
  std::vector<int32> dilations(4, 1);
  auto input_h = GetTensorDimIndex(data_format, 'H');
  auto input_w = GetTensorDimIndex(data_format, 'W');
  strides[input_h] = row_stride;
  strides[input_w] = col_stride;
  dilations[input_h] = row_dilation;
  dilations[input_w] = col_dilation;
  TensorShape input_shape = in_backprop->shape();

  const TensorShape& filter_shape = filter.shape();
  ConvBackpropDimensions dims;
  OP_REQUIRES_OK(
      ctx, ConvBackpropComputeDimensionsV2(
               "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2, input_shape,
               filter_shape, out_backprop.shape(), dilations, strides, padding,
               explicit_paddings, data_format, &dims));

  int64_t padding_top = -1, padding_bottom = -1;
  int64_t padding_left = -1, padding_right = -1;
  if (padding == EXPLICIT) {
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
                             &padding_bottom);
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
                             &padding_right);
  }
  int64_t expected_out_rows, expected_out_cols;
  // The calls below are guaranteed to succeed because we already validated
  // the output shape and padding earlier.
  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
      dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
      row_dilation, row_stride, padding, &expected_out_rows, &padding_top,
      &padding_bottom));
  DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows);
  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
      dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
      col_dilation, col_stride, padding, &expected_out_cols, &padding_left,
      &padding_right));
  DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols);

  auto* stream = ctx->op_device_context()->stream();
  OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));

  if (!use_cudnn) {
    ctx->SetStatus(errors::Unimplemented(
        "Conv2DBackpropInput for GPU is not currently supported "
        "without cudnn"));
    return;
  }

  // If the filter in-depth (filter_shape.dim_size(2)) is 1 and smaller than
  // the input depth, it's a depthwise convolution. More generally, if the
  // filter in-depth divides but is smaller than the input depth, it is a
  // grouped convolution.
  bool is_grouped_convolution = filter_shape.dim_size(2) != dims.in_depth;
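  // For example (hypothetical sizes): in_depth = 8 with a filter in-depth of 2
  // gives 8 / 2 = 4 groups, while a filter in-depth of 1 degenerates to a
  // depthwise convolution with 8 groups.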
  if (dims.spatial_dims[0].filter_size == 1 &&
      dims.spatial_dims[1].filter_size == 1 && !is_grouped_convolution &&
      dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 &&
      data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) {
    // 1x1 filter, so call cublas directly.
    const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size *
                     dims.spatial_dims[1].input_size;
    const uint64 k = dims.out_depth;
    const uint64 n = dims.in_depth;

    auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
                                out_backprop.template flat<T>().size());
    auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                filter.template flat<T>().size());
    auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                in_backprop->template flat<T>().size());

    auto transpose = se::blas::Transpose::kTranspose;
    auto no_transpose = se::blas::Transpose::kNoTranspose;

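    // With a 1x1 filter and unit strides, backprop-to-input is a plain matrix
    // product: in_backprop (m x n) = out_backprop (m x k) * filter^T, where m
    // folds the batch and spatial dims, k = out_depth, and n = in_depth. The
    // swapped operand order below accounts for cuBLAS's column-major layout.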
    OP_REQUIRES_OK(ctx, stream->ThenBlasGemm(transpose, no_transpose, n, m, k,
                                             b_ptr, k, a_ptr, k, &c_ptr, n));
    return;
  } else if (dims.spatial_dims[0].filter_size ==
                 dims.spatial_dims[0].input_size &&
             dims.spatial_dims[1].filter_size ==
                 dims.spatial_dims[1].input_size &&
             !is_grouped_convolution && padding == VALID &&
             data_format == FORMAT_NHWC) {
    // The input data and filter have the same height/width, and we are not
    // using grouped convolution, so call cublas directly.
    const uint64 m = dims.batch_size;
    const uint64 k = dims.out_depth;
    const uint64 n = dims.spatial_dims[0].input_size *
                     dims.spatial_dims[1].input_size * dims.in_depth;
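    // Same GEMM mapping as the 1x1 fast path above, except that the filter
    // covers the whole input and the output is spatially 1x1: m is just the
    // batch size and the entire H * W * in_depth extent is folded into n.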

    auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
                                out_backprop.template flat<T>().size());
    auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                filter.template flat<T>().size());
    auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                in_backprop->template flat<T>().size());

    auto transpose = se::blas::Transpose::kTranspose;
    auto no_transpose = se::blas::Transpose::kNoTranspose;

    OP_REQUIRES_OK(ctx, stream->ThenBlasGemm(transpose, no_transpose, n, m, k,
                                             b_ptr, k, a_ptr, k, &c_ptr, n));
    return;
  }

  const int64_t common_padding_rows = std::min(padding_top, padding_bottom);
  const int64_t common_padding_cols = std::min(padding_left, padding_right);
  TensorShape compatible_input_shape;
  if (padding_top != padding_bottom || padding_left != padding_right) {
    // Pad the input in the same way we did during the forward pass, so that
    // cuDNN or MIOpen receives the same input during the backward pass
    // function as it did during the forward pass function.
    const int64_t padding_rows_diff = std::abs(padding_bottom - padding_top);
    const int64_t padding_cols_diff = std::abs(padding_right - padding_left);
    const int64_t new_in_rows =
        dims.spatial_dims[0].input_size + padding_rows_diff;
    const int64_t new_in_cols =
        dims.spatial_dims[1].input_size + padding_cols_diff;
    compatible_input_shape = ShapeFromFormat(
        data_format, dims.batch_size, new_in_rows, new_in_cols, dims.in_depth);
  } else {
    compatible_input_shape = input_shape;
  }
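
  // cuDNN and MIOpen only accept symmetric zero padding: the symmetric part
  // (common_padding_*) is passed through the convolution descriptor below,
  // while any asymmetric remainder is absorbed by enlarging the input shape
  // here and sliced off again after the convolution (see the PadInput call
  // near the end of this function).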

  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
      << "Negative row or col paddings: (" << common_padding_rows << ", "
      << common_padding_cols << ")";

  // The Tensor Core in NVIDIA Volta+ GPUs supports efficient convolution with
  // fp16 in NHWC data layout. In all other configurations it's more efficient
  // to run computation in NCHW data format.
  const bool compute_in_nhwc = DataTypeToEnum<T>::value == DT_HALF &&
                               stream->GetCudaComputeCapability().IsAtLeast(
                                   se::CudaComputeCapability::VOLTA);

  // We only do one directional conversion: NHWC->NCHW. We never convert in
  // the other direction. Grappler layout optimizer selects the preferred
  // layout and adds necessary annotations to the graph.
  const TensorFormat compute_data_format =
      (compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC
                                                      : FORMAT_NCHW;

  VLOG(3) << "Compute Conv2DBackpropInput with cuDNN:"
          << " data_format=" << ToString(data_format)
          << " compute_data_format=" << ToString(compute_data_format);

  constexpr auto kComputeInNHWC =
      std::make_tuple(se::dnn::DataLayout::kBatchYXDepth,
                      se::dnn::FilterLayout::kOutputYXInput);
  constexpr auto kComputeInNCHW =
      std::make_tuple(se::dnn::DataLayout::kBatchDepthYX,
                      se::dnn::FilterLayout::kOutputInputYX);
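  // Layout mapping between TensorFlow and StreamExecutor enums:
  //   kBatchYXDepth == NHWC data,  kOutputYXInput == OHWI filters;
  //   kBatchDepthYX == NCHW data,  kOutputInputYX == OIHW filters.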

  se::dnn::DataLayout compute_data_layout;
  se::dnn::FilterLayout filter_layout;

  std::tie(compute_data_layout, filter_layout) =
      compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW;

  se::dnn::BatchDescriptor input_desc;
  input_desc.set_count(dims.batch_size)
      .set_height(GetTensorDim(compatible_input_shape, data_format, 'H'))
      .set_width(GetTensorDim(compatible_input_shape, data_format, 'W'))
      .set_feature_map_count(dims.in_depth)
      .set_layout(compute_data_layout);
  se::dnn::BatchDescriptor output_desc;
  output_desc.set_count(dims.batch_size)
      .set_height(dims.spatial_dims[0].output_size)
      .set_width(dims.spatial_dims[1].output_size)
      .set_feature_map_count(dims.out_depth)
      .set_layout(compute_data_layout);
  se::dnn::FilterDescriptor filter_desc;
  filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size)
      .set_input_filter_width(dims.spatial_dims[1].filter_size)
      .set_input_feature_map_count(filter_shape.dim_size(2))
      .set_output_feature_map_count(filter_shape.dim_size(3))
      .set_layout(filter_layout);
  se::dnn::ConvolutionDescriptor conv_desc;
  conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
      .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
      .set_vertical_filter_stride(dims.spatial_dims[0].stride)
      .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
      .set_zero_padding_height(common_padding_rows)
      .set_zero_padding_width(common_padding_cols)
      .set_group_count(dims.in_depth / filter_shape.dim_size(2));

  // Tensorflow filter format: HWIO
  // cuDNN filter formats: (data format) -> (filter format)
  //   (1) NCHW -> OIHW
  //   (2) NHWC -> OHWI
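  //
  // For example (hypothetical sizes), a 3x3 filter with 64 input channels and
  // 128 output channels is stored as HWIO [3, 3, 64, 128] by TensorFlow and is
  // transformed below to OIHW [128, 64, 3, 3] or OHWI [128, 3, 3, 64].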

  Tensor transformed_filter;
  const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status {
    VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO)
            << " to " << ToString(dst_format);

    TensorShape dst_shape =
        dst_format == FORMAT_OIHW
            ? TensorShape({filter.dim_size(3), filter.dim_size(2),
                           filter.dim_size(0), filter.dim_size(1)})
            : TensorShape({filter.dim_size(3), filter.dim_size(0),
                           filter.dim_size(1), filter.dim_size(2)});

    TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, dst_shape,
                                          &transformed_filter));
    functor::TransformFilter<GPUDevice, T, int, 4>()(
        ctx->eigen_device<GPUDevice>(), dst_format,
        To32Bit(filter.tensor<T, 4>()),
        To32Bit(transformed_filter.tensor<T, 4>()));

    return Status::OK();
  };

  if (compute_data_format == FORMAT_NCHW) {
    OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OIHW));
  } else if (compute_data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OHWI));
  } else {
    ctx->SetStatus(errors::InvalidArgument("Invalid compute data format: ",
                                           ToString(compute_data_format)));
    return;
  }

  Tensor transformed_out_backprop;
  if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
    VLOG(4) << "Convert the `out_backprop` tensor from NHWC to NCHW.";
    TensorShape compute_shape = ShapeFromFormat(
        compute_data_format, dims.batch_size, dims.spatial_dims[0].output_size,
        dims.spatial_dims[1].output_size, dims.out_depth);
    if (dims.out_depth > 1) {
      OP_REQUIRES_OK(ctx,
                     ctx->allocate_temp(DataTypeToEnum<T>::value, compute_shape,
                                        &transformed_out_backprop));
      functor::NHWCToNCHW<GPUDevice, T, 4>()(
          ctx->eigen_device<GPUDevice>(), out_backprop.tensor<T, 4>(),
          transformed_out_backprop.tensor<T, 4>());
    } else {
      // If depth <= 1, then just reshape.
      CHECK(transformed_out_backprop.CopyFrom(out_backprop, compute_shape));
    }
  } else {
    transformed_out_backprop = out_backprop;
  }

  Tensor pre_transformed_in_backprop;
  OP_REQUIRES_OK(
      ctx, ctx->allocate_temp(
               DataTypeToEnum<T>::value,
               ShapeFromFormat(
                   compute_data_format,
                   GetTensorDim(compatible_input_shape, data_format, 'N'),
                   GetTensorDim(compatible_input_shape, data_format, 'H'),
                   GetTensorDim(compatible_input_shape, data_format, 'W'),
                   GetTensorDim(compatible_input_shape, data_format, 'C')),
               &pre_transformed_in_backprop));

  auto out_backprop_ptr =
      AsDeviceMemory(transformed_out_backprop.template flat<T>().data(),
                     transformed_out_backprop.template flat<T>().size());
  auto filter_ptr =
      AsDeviceMemory(transformed_filter.template flat<T>().data(),
                     transformed_filter.template flat<T>().size());
  auto in_backprop_ptr =
      AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(),
                     pre_transformed_in_backprop.template flat<T>().size());

  int64_t workspace_bytes = 1LL << 32;  // 4GB by default.
  // The cuDNN frontend exposes more engines, some of which might use too much
  // workspace; this would increase the overall memory demand when training
  // models, so use a smaller default limit in that case.
  if (CudnnUseFrontend()) {
    workspace_bytes = 1LL << 30;  // 1GB by default.
  }
  static int64_t ConvolveBackwardDataScratchSize =
      GetDnnWorkspaceLimit("TF_CUDNN_WORKSPACE_LIMIT_IN_MB", workspace_bytes);
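  // The TF_CUDNN_WORKSPACE_LIMIT_IN_MB environment variable, when set,
  // overrides the default scratch limit chosen above (the value is interpreted
  // in megabytes).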
  DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx);
  int device_id = stream->parent()->device_ordinal();
  DataType dtype = out_backprop.dtype();
  ConvParameters conv_parameters = {
      dims.batch_size,                     // batch
      dims.in_depth,                       // in_depths
      {{input_desc.height(),               // in_rows
        input_desc.width()}},              // in_cols
      compute_data_format,                 // compute_data_format
      dims.out_depth,                      // out_depths
      {{dims.spatial_dims[0].filter_size,  // filter_rows
        dims.spatial_dims[1].filter_size,  // filter_cols
        filter_shape.dim_size(2)}},        // filter_depths
      {{dims.spatial_dims[0].dilation,     // dilation_rows
        dims.spatial_dims[1].dilation}},   // dilation_cols
      {{dims.spatial_dims[0].stride,       // stride_rows
        dims.spatial_dims[1].stride}},     // stride_cols
      {{common_padding_rows,               // padding_rows
        common_padding_cols}},             // padding_cols
      dtype,                               // tensor data type
      device_id,                           // device_id
      conv_desc.group_count()              // group_count
  };
#if TENSORFLOW_USE_ROCM
  // cudnn_use_autotune is applicable only to the CUDA flow. For ROCm/MIOpen,
  // we need to call GetMIOpenConvolveAlgorithms explicitly if we do not have a
  // cached algorithm_config for this conv_parameters.
  cudnn_use_autotune = true;
#endif
  AlgorithmConfig algorithm_config;

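  // Autotuning: for each new ConvParameters key, the block below profiles
  // every candidate algorithm (or cuDNN frontend execution plan) and caches
  // the fastest one in AutotuneConvBwdData, so subsequent calls with the same
  // configuration skip straight to the launch at the bottom of this function.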
  if (cudnn_use_autotune && !AutotuneConvBwdData::GetInstance()->Find(
                                conv_parameters, &algorithm_config)) {
    profiler::ScopedAnnotation trace("cudnn_autotuning");

    std::vector<std::unique_ptr<se::dnn::ConvolveExecutionPlan>> plans;
#if GOOGLE_CUDA
    std::vector<AlgorithmDesc> algorithms;
    std::vector<AlgorithmConfig> configs;
    if (CudnnUseFrontend()) {
      OP_REQUIRES(
          ctx,
          stream->parent()->GetConvolveExecutionPlans(
              se::dnn::ConvolutionKind::BACKWARD_DATA,
              se::dnn::ToDataType<T>::value, stream, input_desc, filter_desc,
              output_desc, conv_desc, &plans),
          errors::Unknown("Failed to get convolution execution plan. This is "
                          "probably because cuDNN failed to initialize, so try "
                          "looking to see if a warning log message was printed "
                          "above."));
      for (const auto& plan : plans) {
        configs.push_back(
            AlgorithmConfig(AlgorithmDesc{plan->getTag(), plan->get_raw_desc()},
                            plan->getWorkspaceSize()));
      }
    } else {
      OP_REQUIRES(
          ctx, stream->parent()->GetConvolveBackwardDataAlgorithms(&algorithms),
          errors::Unknown("Failed to get convolution execution plan. This is "
                          "probably because cuDNN failed to initialize, so try "
                          "looking to see if a warning log message was printed "
                          "above."));
      for (const auto& algorithm : algorithms) {
        configs.push_back(AlgorithmConfig(algorithm));
      }
    }

    se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                                stream);

    se::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter,
                                      se::GpuAsmOpts());

    se::DeviceMemory<T> in_backprop_ptr_rz(
        WrapRedzoneBestEffort(&rz_allocator, in_backprop_ptr));

    std::vector<tensorflow::AutotuneResult> results;
    for (auto& profile_config : configs) {
      // TODO(zhengxq): profile each algorithm multiple times for better
      // accuracy.
      DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
                                            ctx);
      se::RedzoneAllocator rz_scratch_allocator(
          stream, &tf_allocator_adapter, se::GpuAsmOpts(),
          /*memory_limit=*/ConvolveBackwardDataScratchSize);
      se::ScratchAllocator* allocator_used =
          !RedzoneCheckDisabled()
              ? static_cast<se::ScratchAllocator*>(&rz_scratch_allocator)
              : static_cast<se::ScratchAllocator*>(&scratch_allocator);
      ProfileResult profile_result;
      Status cudnn_launch_status;
      if (CudnnUseFrontend()) {
        cudnn_launch_status = stream->ConvolveBackwardDataWithExecutionPlan(
            filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
            input_desc, &in_backprop_ptr_rz, allocator_used, profile_config,
            &profile_result);
      } else {
        cudnn_launch_status = stream->ConvolveBackwardDataWithAlgorithm(
            filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
            input_desc, &in_backprop_ptr_rz, allocator_used, profile_config,
            &profile_result);
      }

      if (cudnn_launch_status.ok() && profile_result.is_valid()) {
        results.emplace_back();
        auto& result = results.back();
        if (CudnnUseFrontend()) {
          result.mutable_cuda_conv_plan()->set_exec_plan_id(
              profile_config.algorithm()->exec_plan_id());
        } else {
          result.mutable_conv()->set_algorithm(
              profile_config.algorithm()->algo_id());
          result.mutable_conv()->set_tensor_ops_enabled(
              profile_config.algorithm()->tensor_ops_enabled());
        }

        result.set_scratch_bytes(
            !RedzoneCheckDisabled()
                ? rz_scratch_allocator.TotalAllocatedBytesExcludingRedzones()
                : scratch_allocator.TotalByteSize());
        *result.mutable_run_time() = proto_utils::ToDurationProto(
            absl::Milliseconds(profile_result.elapsed_time_in_ms()));

        CheckRedzones(rz_scratch_allocator, &result);
        CheckRedzones(rz_allocator, &result);
      } else if (CudnnUseFrontend()) {
        // When the cuDNN frontend APIs are used, the profiling results must
        // map one-to-one onto the "plans", so we insert a dummy result
        // whenever the execution fails.
        results.emplace_back();
        auto& result = results.back();
        result.mutable_failure()->set_kind(AutotuneResult::UNKNOWN);
        result.mutable_failure()->set_msg(
            absl::StrCat("Profiling failure on CUDNN engine: ",
                         profile_config.algorithm()->exec_plan_id()));
      }
    }
#elif TENSORFLOW_USE_ROCM
    DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx);
    std::vector<ProfileResult> algorithms;
    OP_REQUIRES(
        ctx,
        stream->parent()->GetMIOpenConvolveAlgorithms(
            se::dnn::ConvolutionKind::BACKWARD_DATA,
            se::dnn::ToDataType<T>::value, stream, input_desc, in_backprop_ptr,
            filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
            &scratch_allocator, &algorithms),
        errors::Unknown(
            "Failed to get convolution algorithm. This is probably "
            "because MIOpen failed to initialize, so try looking to "
            "see if a warning log message was printed above."));

    std::vector<tensorflow::AutotuneResult> results;
    if (algorithms.size() == 1) {
      auto profile_result = algorithms[0];
      results.emplace_back();
      auto& result = results.back();
      result.mutable_conv()->set_algorithm(
          profile_result.algorithm().algo_id());
      result.mutable_conv()->set_tensor_ops_enabled(
          profile_result.algorithm().tensor_ops_enabled());

      result.set_scratch_bytes(profile_result.scratch_size());
      *result.mutable_run_time() = proto_utils::ToDurationProto(
          absl::Milliseconds(profile_result.elapsed_time_in_ms()));
    } else {
      for (auto miopen_algorithm : algorithms) {
        auto profile_algorithm = miopen_algorithm.algorithm();
        ProfileResult profile_result;
        auto miopen_launch_status = stream->ConvolveBackwardDataWithAlgorithm(
            filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
            input_desc, &in_backprop_ptr, &scratch_allocator,
            AlgorithmConfig(profile_algorithm, miopen_algorithm.scratch_size()),
            &profile_result);

        if (miopen_launch_status.ok() && profile_result.is_valid()) {
          results.emplace_back();
          auto& result = results.back();
          result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
          result.mutable_conv()->set_tensor_ops_enabled(
              profile_algorithm.tensor_ops_enabled());
          result.set_scratch_bytes(scratch_allocator.TotalByteSize());
          *result.mutable_run_time() = proto_utils::ToDurationProto(
              absl::Milliseconds(profile_result.elapsed_time_in_ms()));
        }
      }
    }
#endif
    LogConvAutotuneResults(
        se::dnn::ConvolutionKind::BACKWARD_DATA, se::dnn::ToDataType<T>::value,
        in_backprop_ptr, filter_ptr, out_backprop_ptr, input_desc, filter_desc,
        output_desc, conv_desc, stream->parent(), results);
    if (CudnnUseFrontend()) {
      OP_REQUIRES_OK(
          ctx, BestCudnnConvAlgorithm(results, &plans, &algorithm_config));

    } else {
      OP_REQUIRES_OK(
          ctx, BestCudnnConvAlgorithm(results, nullptr, &algorithm_config));
    }
    AutotuneConvBwdData::GetInstance()->Insert(conv_parameters,
                                               algorithm_config);
  }

  Status cudnn_launch_status;
  if (CudnnUseFrontend()) {
    if (algorithm_config.algorithm().has_value()) {
      VLOG(4) << "Conv2DBackpropInput Execution Plan: "
              << algorithm_config.algorithm()->exec_plan_id();
    } else {
      VLOG(4) << "Convolution Autotune has been turned off";
    }
    cudnn_launch_status = stream->ConvolveBackwardDataWithExecutionPlan(
        filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
        input_desc, &in_backprop_ptr, &scratch_allocator, algorithm_config,
        nullptr);
  } else {
    cudnn_launch_status = stream->ConvolveBackwardDataWithAlgorithm(
        filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc,
        input_desc, &in_backprop_ptr, &scratch_allocator, algorithm_config,
        nullptr);
  }

  if (!cudnn_launch_status.ok()) {
    ctx->SetStatus(cudnn_launch_status);
    return;
  }

  if (padding_top != padding_bottom || padding_left != padding_right) {
    Tensor in_backprop_remove_padding;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_temp(
                 DataTypeToEnum<T>::value,
                 ShapeFromFormat(compute_data_format,
                                 GetTensorDim(input_shape, data_format, 'N'),
                                 GetTensorDim(input_shape, data_format, 'H'),
                                 GetTensorDim(input_shape, data_format, 'W'),
                                 GetTensorDim(input_shape, data_format, 'C')),
                 &in_backprop_remove_padding));

    // Remove the padding that was added to the input shape above.
    const int64_t input_pad_top = padding_top - common_padding_rows;
    const int64_t input_pad_bottom = padding_bottom - common_padding_rows;
    const int64_t input_pad_left = padding_left - common_padding_cols;
    const int64_t input_pad_right = padding_right - common_padding_cols;
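    // The negative padding amounts passed to PadInput below make it crop
    // rather than pad, slicing off the extra rows/columns that were added to
    // keep cuDNN's padding symmetric.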
    functor::PadInput<GPUDevice, T, int, 4>()(
        ctx->template eigen_device<GPUDevice>(),
        To32Bit(const_cast<const Tensor&>(pre_transformed_in_backprop)
                    .tensor<T, 4>()),
        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
        {{static_cast<int>(-input_pad_bottom),
          static_cast<int>(-input_pad_right)}},
        To32Bit(in_backprop_remove_padding.tensor<T, 4>()), compute_data_format,
        T{});

    pre_transformed_in_backprop = in_backprop_remove_padding;
  }

  if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
    VLOG(4) << "Convert the output tensor back from NCHW to NHWC.";
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 4>()(
        ctx->eigen_device<GPUDevice>(),
        toConstTensor(pre_transformed_in_backprop).template tensor<T, 4>(),
        in_backprop->tensor<T, 4>());
  } else {
    *in_backprop = pre_transformed_in_backprop;
  }
}

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                             \
  template <>                                                           \
  void TransformFilter<GPUDevice, T, int, 4>::operator()(               \
      const GPUDevice& d, FilterTensorFormat dst_filter_format,         \
      typename TTypes<T, 4, int>::ConstTensor in,                       \
      typename TTypes<T, 4, int>::Tensor out);                          \
  extern template struct TransformFilter<GPUDevice, T, int, 4>;         \
  template <>                                                           \
  void PadInput<GPUDevice, T, int, 4>::operator()(                      \
      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,   \
      const std::array<int, 2>& padding_left,                           \
      const std::array<int, 2>& padding_right,                          \
      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
      const T& padding_value);                                          \
  extern template struct PadInput<GPUDevice, T, int, 4>;

DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(double);
#undef DECLARE_GPU_SPEC

template <>
void SpatialConvolutionBackwardInputFunc<GPUDevice, int32>::operator()(
    const GPUDevice&, typename TTypes<int32, 4>::Tensor,
    typename TTypes<int32, 4>::ConstTensor,
    typename TTypes<int32, 4>::ConstTensor, Eigen::DenseIndex,
    Eigen::DenseIndex, Eigen::DenseIndex, Eigen::DenseIndex);
extern template struct SpatialConvolutionBackwardInputFunc<GPUDevice, int32>;

template <>
void SpatialConvolutionBackwardInputWithExplicitPaddingFunc<
    GPUDevice, int32>::operator()(const GPUDevice&,
                                  typename TTypes<int32, 4>::Tensor,
                                  typename TTypes<int32, 4>::ConstTensor,
                                  typename TTypes<int32, 4>::ConstTensor,
                                  Eigen::DenseIndex, Eigen::DenseIndex,
                                  Eigen::DenseIndex, Eigen::DenseIndex,
                                  Eigen::DenseIndex, Eigen::DenseIndex,
                                  Eigen::DenseIndex, Eigen::DenseIndex);
extern template struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc<
    GPUDevice, int32>;

}  // namespace functor

REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<double>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, double>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<float>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<Eigen::half>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, int32>);

// To be used inside depthwise_conv_grad_op.cc.
// TODO(reedwm): Move this and the definition to depthwise_conv_grad_op.cc.
template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow