/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/kernels/pooling_ops_common.h"

#include <vector>

#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/kernel_shape_util.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"

#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
#endif  // GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#if TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#endif
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

namespace {

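// RawType maps a tensor element type to the scalar type whose pointer is
// handed to StreamExecutor below; quantized qint8 buffers are passed as
// plain int8.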
template <typename T>
struct RawType {
  using type = T;
};

template <>
struct RawType<qint8> {
  using type = int8;
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

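// Pads the input with -infinity so that padded cells can never win a max
// comparison. This is used below to emulate asymmetric explicit padding,
// which cuDNN/MIOpen do not support directly.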
template <typename T>
struct PadInputWithNegativeInf {
  Status operator()(const GPUDevice& d,
                    typename TTypes<T, 4, int>::ConstTensor in,
                    int input_pad_top, int input_pad_bottom, int input_pad_left,
                    int input_pad_right, typename TTypes<T, 4, int>::Tensor out,
                    TensorFormat format) {
    T padding_value = -std::numeric_limits<T>::infinity();
    functor::PadInput<GPUDevice, T, int, 4>()(
        d, in, {{input_pad_top, input_pad_left}},
        {{input_pad_bottom, input_pad_right}}, out, format, padding_value);
    return Status::OK();
  }
};

template <>
struct PadInputWithNegativeInf<qint8> {
  Status operator()(const GPUDevice& d,
                    typename TTypes<qint8, 4, int>::ConstTensor in,
                    int input_pad_top, int input_pad_bottom, int input_pad_left,
                    int input_pad_right,
                    typename TTypes<qint8, 4, int>::Tensor out,
                    TensorFormat format) {
    return errors::InvalidArgument(
        "Explicit padding not yet supported with qint8");
  }
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace

Status CheckPaddingSize(int64_t window_rows, int64_t window_cols,
                        int64_t pad_top, int64_t pad_bottom, int64_t pad_left,
                        int64_t pad_right) {
  if (!FastBoundsCheck(pad_top, window_rows)) {
    return errors::InvalidArgument("Top padding ", pad_top,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_rows);
  }
  if (!FastBoundsCheck(pad_bottom, window_rows)) {
    return errors::InvalidArgument("Bottom padding ", pad_bottom,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_rows);
  }
  if (!FastBoundsCheck(pad_left, window_cols)) {
    return errors::InvalidArgument("Left padding ", pad_left,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_cols);
  }
  if (!FastBoundsCheck(pad_right, window_cols)) {
    return errors::InvalidArgument("Right padding ", pad_right,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_cols);
  }
  return Status::OK();
}

PoolParameters::PoolParameters(OpKernelContext* context,
                               const std::vector<int32>& ksize,
                               const std::vector<int32>& stride,
                               Padding padding,
                               std::vector<int64> explicit_paddings,
                               TensorFormat data_format,
                               const TensorShape& tensor_in_shape) {
  // For maxpooling, tensor_in should have 2 spatial dimensions.
  // Note: the total number of dimensions could be 4 for NHWC, NCHW,
  // or 5 for NCHW_VECT_C.
  OP_REQUIRES(context,
              GetTensorSpatialDims(tensor_in_shape.dims(), data_format) == 2,
              errors::InvalidArgument(
                  "tensor_in_shape must have 2 spatial dimensions. ",
                  tensor_in_shape.dims(), " ", data_format));

  this->data_format = data_format;
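  // NCHW_VECT_C packs four channel values into every element along C, so the
  // logical depth is four times the size of the tensor's C dimension.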
  depth = GetTensorDim(tensor_in_shape, data_format, 'C') *
          (data_format == FORMAT_NCHW_VECT_C ? 4 : 1);
  tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W');
  tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H');
  tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
  window_rows = GetTensorDim(ksize, data_format, 'H');
  window_cols = GetTensorDim(ksize, data_format, 'W');
  depth_window = GetTensorDim(ksize, data_format, 'C');
  row_stride = GetTensorDim(stride, data_format, 'H');
  col_stride = GetTensorDim(stride, data_format, 'W');
  depth_stride = GetTensorDim(stride, data_format, 'C');

  // We only support 2D pooling across width/height and depthwise
  // pooling, not a combination.
  OP_REQUIRES(context,
              (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
              errors::Unimplemented(
                  "MaxPooling supports exactly one of pooling across depth "
                  "or pooling across width/height."));
  if (padding == Padding::EXPLICIT) {
    OP_REQUIRES_OK(context, CheckValidPadding(padding, explicit_paddings,
                                              /*num_dims=*/4, data_format));
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &pad_top,
                             &pad_bottom);
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &pad_left,
                             &pad_right);
    OP_REQUIRES_OK(context, CheckPaddingSize(window_rows, window_cols, pad_top,
                                             pad_bottom, pad_left, pad_right));
  }

  if (depth_window == 1) {
    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                tensor_in_rows, window_rows, row_stride,
                                padding, &out_height, &pad_top, &pad_bottom));
    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                tensor_in_cols, window_cols, col_stride,
                                padding, &out_width, &pad_left, &pad_right));
    pad_depth = 0;
    out_depth = depth;
  } else {
    OP_REQUIRES(context, depth_window > 0,
                errors::InvalidArgument("depth_window must not be 0"));
    // Our current version of depthwise max pooling does not support
    // any padding, and expects the depth_window to equal the
    // depth_stride (no overlapping).
    OP_REQUIRES(
        context, depth % depth_window == 0,
        errors::Unimplemented("Depthwise max pooling requires the depth "
                              "window to evenly divide the input depth"));
    OP_REQUIRES(
        context, depth_stride == depth_window,
        errors::Unimplemented("Depthwise max pooling requires the depth "
                              "window to equal the depth stride"));

    // The current version of depthwise max is only implemented on CPU.
    OP_REQUIRES(context,
                (DeviceType(static_cast<Device*>(context->device())
                                ->attributes()
                                .device_type()) == DeviceType(DEVICE_CPU)),
                errors::Unimplemented("Depthwise max pooling is currently "
                                      "only implemented for CPU devices."));

    pad_depth = 0;
    out_depth = depth / depth_window;
  }
}

TensorShape PoolParameters::forward_output_shape() {
  if (depth_window == 1) {
    // Spatial pooling
    return ShapeFromFormat(data_format, tensor_in_batch, out_height, out_width,
                           depth);
  } else {
    // Depthwise pooling
    return TensorShape(
        {tensor_in_batch, tensor_in_rows, tensor_in_cols, out_depth});
  }
}

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename T>
void DnnPoolingOp<T>::Compute(OpKernelContext* context,
                              se::dnn::PoolingMode pooling_mode,
                              const std::vector<int32>& size,
                              const std::vector<int32>& stride, Padding padding,
                              std::vector<int64> explicit_paddings,
                              TensorFormat data_format, const Tensor& tensor_in,
                              const TensorShape& tensor_out_shape,
                              bool propagate_nans) {
  Tensor* tensor_out = nullptr;
  OP_REQUIRES_OK(context,
                 context->allocate_output(0, tensor_out_shape, &tensor_out));
  if (tensor_in.shape().num_elements() == 0) {
    return;
  }

  PoolParameters params{
      context, size, stride, padding,
      explicit_paddings, data_format, tensor_in.shape()};
  if (!context->status().ok()) {
    return;
  }

  int batch_size = params.tensor_in_batch;
  int depth = params.depth;
  int tensor_in_cols = params.tensor_in_cols;
  int tensor_in_rows = params.tensor_in_rows;

#if CUDNN_VERSION < 7300
  /// Older cudnn versions do not support the NHWC format, so the input has to
  /// be converted to NCHW before calling cudnn. Remove this conversion once
  /// those versions are no longer supported.
  Tensor transformed_input;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_in.shape(),
                                                data_format),
                                &transformed_input));
    functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                           tensor_in.tensor<T, 4>(),
                                           transformed_input.tensor<T, 4>());
  } else {
    transformed_input = tensor_in;
  }
  Tensor transformed_output;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_out_shape,
                                                data_format),
                                &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX;
#else
  Tensor transformed_input = tensor_in;
  auto& transformed_output = *tensor_out;
  se::dnn::DataLayout data_layout;
  switch (data_format) {
    case FORMAT_NHWC:
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
      break;
    case FORMAT_NCHW:
      data_layout = se::dnn::DataLayout::kBatchDepthYX;
      break;
    case FORMAT_NCHW_VECT_C:
      // NCHW_VECT_C is not supported by cudnnPoolingForward(), but can be
      // emulated via NHWC.
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
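      // Fold the C/4 vector groups into the batch dimension and treat the
      // remaining 4-wide vector as the depth of each folded image.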
      batch_size *= depth / 4;
      depth = 4;
      break;
    default:
      OP_REQUIRES(context, false,
                  errors::InvalidArgument("Unsupported format: ",
                                          ToString(data_format)));
  }
#endif

  int64_t vertical_padding = params.pad_top;
  int64_t horizontal_padding = params.pad_left;

  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
                              params.pad_left != params.pad_right)) {
    // cuDNN only supports padding the same amount on the left and right sides,
    // and on the top and bottom sides. So we manually create a new padded
    // input tensor such that we can pass it to cuDNN.
    const int64_t common_padding_rows =
        std::min(params.pad_top, params.pad_bottom);
    const int64_t common_padding_cols =
        std::min(params.pad_left, params.pad_right);

    Tensor padded_input;
    const int64_t padding_rows_diff =
        std::abs(params.pad_top - params.pad_bottom);
    const int64_t padding_cols_diff =
        std::abs(params.pad_left - params.pad_right);

    const int64_t new_in_rows = tensor_in_rows + padding_rows_diff;
    const int64_t new_in_cols = tensor_in_cols + padding_cols_diff;

    OP_REQUIRES_OK(
        context,
        context->allocate_temp(DataTypeToEnum<T>::value,
                               ShapeFromFormat(data_format, batch_size,
                                               new_in_rows, new_in_cols, depth),
                               &padded_input));
    const int64_t input_pad_top = params.pad_top - common_padding_rows;
    const int64_t input_pad_bottom = params.pad_bottom - common_padding_rows;
    const int64_t input_pad_left = params.pad_left - common_padding_cols;
    const int64_t input_pad_right = params.pad_right - common_padding_cols;

    bool in_bounds =
        FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
    if (!in_bounds) {
      context->SetStatus(errors::InvalidArgument("Padding is too large."));
      return;
    }

    // We need to call the const version of transformed_input.tensor()
    const Tensor& const_transformed_input = transformed_input;
    OP_REQUIRES_OK(
        context,
        PadInputWithNegativeInf<T>()(
            context->eigen_device<GPUDevice>(),
            To32Bit(const_transformed_input.tensor<T, 4>()),
            static_cast<int>(input_pad_top), static_cast<int>(input_pad_bottom),
            static_cast<int>(input_pad_left), static_cast<int>(input_pad_right),
            To32Bit(padded_input.tensor<T, 4>()), data_format));
    transformed_input = padded_input;
    vertical_padding = common_padding_rows;
    horizontal_padding = common_padding_cols;
    tensor_in_rows = new_in_rows;
    tensor_in_cols = new_in_cols;
  }

  se::dnn::PoolingDescriptor pooling_desc;
  pooling_desc.set_pooling_mode(pooling_mode)
      .set_window_height(params.window_rows)
      .set_window_width(params.window_cols)
      .set_vertical_stride(params.row_stride)
      .set_horizontal_stride(params.col_stride)
      .set_vertical_padding(vertical_padding)
      .set_horizontal_padding(horizontal_padding)
      .set_propagate_nans(propagate_nans);

  se::dnn::BatchDescriptor input_desc;
  input_desc.set_count(batch_size)
      .set_height(tensor_in_rows)
      .set_width(tensor_in_cols)
      .set_feature_map_count(depth)
      .set_layout(data_layout);

  se::dnn::BatchDescriptor output_desc;
  output_desc.set_count(batch_size)
      .set_height(params.out_height)
      .set_width(params.out_width)
      .set_feature_map_count(depth)
      .set_layout(data_layout);

  auto input_data =
      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
                         transformed_input.template flat<T>().data()),
                     transformed_input.template flat<T>().size());

  auto output_data =
      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
                         transformed_output.template flat<T>().data()),
                     transformed_output.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

#if TENSORFLOW_USE_ROCM
  static int64 PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );

  DnnScratchAllocator scratch_allocator(PoolingScratchSize, context);
  bool status =
      stream
          ->ThenPoolForward(pooling_desc, input_desc, input_data, output_desc,
                            &output_data, &scratch_allocator)
          .ok();
#else
  bool status = stream
                    ->ThenPoolForward(pooling_desc, input_desc, input_data,
                                      output_desc, &output_data)
                    .ok();
#endif
  OP_REQUIRES(context, status,
              errors::Internal("dnn PoolForward launch failed"));
#if CUDNN_VERSION < 7300
  if (data_format == FORMAT_NHWC) {
    /// Transform the output data from NCHW back to NHWC
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    using RT = typename RawType<T>::type;
    functor::NCHWToNHWC<GPUDevice, RT, 4>()(
        context->eigen_device<Device>(),
        toConstTensor(transformed_output).template tensor<RT, 4>(),
        tensor_out->tensor<RT, 4>());
  }
#endif
}

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                               \
  template <>                                                             \
  void PadInput<GPUDevice, T, int, 4>::operator()(                        \
      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,     \
      const std::array<int, 2>& padding_left,                             \
      const std::array<int, 2>& padding_right,                            \
      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format,   \
      const T& padding_value);                                            \
  extern template struct PadInput<GPUDevice, T, int, 4>;

DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(double);
DECLARE_GPU_SPEC(int32);
}  // namespace functor

template <typename T>
void DnnPoolingGradOp<T>::Compute(
    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
    const std::vector<int32>& size, const std::vector<int32>& stride,
    Padding padding, std::vector<int64> explicit_paddings,
    TensorFormat data_format, const Tensor* tensor_in, const Tensor* tensor_out,
    const Tensor& out_backprop, const TensorShape& tensor_in_shape,
    bool propagate_nans) {
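  // The max-pooling backward pass needs the original input and output to
  // determine which element produced each maximum; average pooling does not.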
  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
        (tensor_in && tensor_out))
      << "For MaxPoolGrad, both tensor_in and tensor_out need to be "
         "specified";

  Tensor* input_backprop = nullptr;
  OP_REQUIRES_OK(context,
                 context->allocate_output(0, tensor_in_shape, &input_backprop));
  if (tensor_in_shape.num_elements() == 0) {
    return;
  }

  PoolParameters params{context, size, stride, padding,
                        explicit_paddings, data_format, tensor_in_shape};
  if (!context->status().ok()) {
    return;
  }

  TensorFormat transformed_input_data_format = data_format;

#if CUDNN_VERSION < 7300
  /// Older cudnn versions do not support the NHWC format, so the tensors have
  /// to be converted to NCHW before calling cudnn. Remove this conversion once
  /// those versions are no longer supported.
  Tensor transformed_input;
  TensorShape transformed_input_shape;
  if (data_format == FORMAT_NHWC || !tensor_in) {
    transformed_input_shape =
        ShapeFromFormat(FORMAT_NCHW, tensor_in_shape, data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_input_shape,
                                                   &transformed_input));
  } else {
    transformed_input = *tensor_in;
  }
  Tensor transformed_output;
  TensorShape transformed_output_shape;
  if (data_format == FORMAT_NHWC || !tensor_out) {
    transformed_output_shape =
        ShapeFromFormat(FORMAT_NCHW, out_backprop.shape(), data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_output_shape,
                                                   &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  Tensor transformed_input_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_input_shape,
                                          &transformed_input_backprop));
  } else {
    transformed_input_backprop = *input_backprop;
  }
  Tensor transformed_output_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_output_shape,
                                          &transformed_output_backprop));
  } else {
    transformed_output_backprop = out_backprop;
  }

  if (data_format == FORMAT_NHWC) {
    /// Convert the data from NHWC to NCHW if necessary.
    if (tensor_in) {
      // For AvgPoolGrad, the original input tensor is not necessary. However,
      // cudnn still requires it to run, although it does not affect the
      // results.
      functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                             tensor_in->tensor<T, 4>(),
                                             transformed_input.tensor<T, 4>());
      transformed_input_data_format = FORMAT_NCHW;
    }
    if (tensor_out) {
      // For AvgPoolGrad, the original output tensor is not necessary. However,
      // cudnn still requires it to run, although it does not affect the
      // results.
      functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                             tensor_out->tensor<T, 4>(),
                                             transformed_output.tensor<T, 4>());
    }
    functor::NHWCToNCHW<GPUDevice, T, 4>()(
        context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
        transformed_output_backprop.tensor<T, 4>());
  }
  se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX;
#else
  Tensor transformed_input;
  if (!tensor_in) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          tensor_in_shape, &transformed_input));
  } else {
    transformed_input = *tensor_in;
  }
  Tensor transformed_output;
  if (!tensor_out) {
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   out_backprop.shape(),
                                                   &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  Tensor transformed_input_backprop = *input_backprop;
  Tensor transformed_output_backprop = out_backprop;
  se::dnn::DataLayout data_layout;
  switch (data_format) {
    case FORMAT_NHWC:
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
      break;
    case FORMAT_NCHW:
      data_layout = se::dnn::DataLayout::kBatchDepthYX;
      break;
    default:
      OP_REQUIRES(context, false,
                  errors::InvalidArgument("Unsupported format: ",
                                          ToString(data_format)));
  }
#endif  // CUDNN_VERSION < 7300

  int64_t vertical_padding = params.pad_top;
  int64_t horizontal_padding = params.pad_left;

  int batch_size = params.tensor_in_batch;
  int depth = params.depth;
  int tensor_in_cols = params.tensor_in_cols;
  int tensor_in_rows = params.tensor_in_rows;

  int64_t input_pad_top = 0;
  int64_t input_pad_bottom = 0;
  int64_t input_pad_left = 0;
  int64_t input_pad_right = 0;

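  // When the explicit padding is asymmetric, the gradient is computed with
  // respect to the padded input and the extra rows/columns are cropped off
  // afterwards; otherwise this tensor simply aliases the real input backprop.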
  Tensor transformed_and_padded_input_backprop;

  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
                              params.pad_left != params.pad_right)) {
    // Pad the input in the same way we did during the forward pass, so that
    // cuDNN or MIOpen receives the same input during the backward pass as it
    // did during the forward pass.
    const int64_t common_padding_rows =
        std::min(params.pad_top, params.pad_bottom);
    const int64_t common_padding_cols =
        std::min(params.pad_left, params.pad_right);

    Tensor padded_input;
    const int64_t padding_rows_diff =
        std::abs(params.pad_top - params.pad_bottom);
    const int64_t padding_cols_diff =
        std::abs(params.pad_left - params.pad_right);

    const int64_t new_in_rows = tensor_in_rows + padding_rows_diff;
    const int64_t new_in_cols = tensor_in_cols + padding_cols_diff;

    VLOG(2) << "Create new tensor:"
            << " original rows=" << tensor_in_rows
            << " original cols=" << tensor_in_cols
            << " padded rows=" << new_in_rows
            << " padded cols=" << new_in_cols << " depth=" << depth
            << " batch_size=" << batch_size << " kernel_rows="
            << params.window_rows << " kernel_cols=" << params.window_cols
            << " stride_rows=" << params.row_stride;

    OP_REQUIRES_OK(
        context, context->allocate_temp(
                     DataTypeToEnum<T>::value,
                     ShapeFromFormat(transformed_input_data_format, batch_size,
                                     new_in_rows, new_in_cols, depth),
                     &padded_input));

    OP_REQUIRES_OK(
        context, context->allocate_temp(
                     DataTypeToEnum<T>::value,
                     ShapeFromFormat(transformed_input_data_format, batch_size,
                                     new_in_rows, new_in_cols, depth),
                     &transformed_and_padded_input_backprop));

    input_pad_top = params.pad_top - common_padding_rows;
    input_pad_bottom = params.pad_bottom - common_padding_rows;
    input_pad_left = params.pad_left - common_padding_cols;
    input_pad_right = params.pad_right - common_padding_cols;

    bool in_bounds =
        FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
    if (!in_bounds) {
      context->SetStatus(errors::InvalidArgument("Padding is too large."));
      return;
    }

    // The PadInputWithNegativeInf functor requires its input to be const.
    const Tensor& const_transformed_input = transformed_input;
    OP_REQUIRES_OK(
        context,
        PadInputWithNegativeInf<T>()(
            context->eigen_device<GPUDevice>(),
            To32Bit(const_transformed_input.tensor<T, 4>()),
            static_cast<int>(input_pad_top), static_cast<int>(input_pad_bottom),
            static_cast<int>(input_pad_left), static_cast<int>(input_pad_right),
            To32Bit(padded_input.tensor<T, 4>()),
            transformed_input_data_format));

    transformed_input = padded_input;

    vertical_padding = common_padding_rows;
    horizontal_padding = common_padding_cols;
    VLOG(2) << "vertical padding set to: " << vertical_padding
            << " horizontal padding set to: " << horizontal_padding;
    tensor_in_rows = new_in_rows;
    tensor_in_cols = new_in_cols;
  } else {
    transformed_and_padded_input_backprop = transformed_input_backprop;
  }

  /// Get ready to call cudnn
  se::dnn::PoolingDescriptor pooling_desc;
  pooling_desc.set_pooling_mode(pooling_mode)
      .set_window_height(params.window_rows)
      .set_window_width(params.window_cols)
      .set_vertical_stride(params.row_stride)
      .set_horizontal_stride(params.col_stride)
      .set_vertical_padding(vertical_padding)
      .set_horizontal_padding(horizontal_padding)
      .set_propagate_nans(propagate_nans);

  se::dnn::BatchDescriptor orig_output_desc;
  orig_output_desc.set_count(params.tensor_in_batch)
      .set_height(params.out_height)
      .set_width(params.out_width)
      .set_feature_map_count(params.depth)
      .set_layout(data_layout);

  se::dnn::BatchDescriptor orig_input_desc;
  orig_input_desc.set_count(params.tensor_in_batch)
      .set_height(tensor_in_rows)
      .set_width(tensor_in_cols)
      .set_feature_map_count(params.depth)
      .set_layout(data_layout);

  auto orig_output_data =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());
  auto orig_input_data =
      AsDeviceMemory(transformed_input.template flat<T>().data(),
                     transformed_input.template flat<T>().size());
  auto output_backprop_data =
      AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
                     transformed_output_backprop.template flat<T>().size());
  auto input_backprop_data = AsDeviceMemory(
      transformed_and_padded_input_backprop.template flat<T>().data(),
      transformed_and_padded_input_backprop.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

#if TENSORFLOW_USE_ROCM
  static int64 PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );

  DnnScratchAllocator scratch_allocator(PoolingScratchSize, context);
  bool status = stream
                    ->ThenPoolBackward(pooling_desc, orig_input_desc,
                                       orig_input_data, orig_output_desc,
                                       orig_output_data, output_backprop_data,
                                       &input_backprop_data, &scratch_allocator)
                    .ok();
#else
  bool status =
      stream
          ->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data,
                             orig_output_desc, orig_output_data,
                             output_backprop_data, &input_backprop_data)
          .ok();
#endif

  OP_REQUIRES(context, status,
              errors::Internal("dnn PoolBackward launch failed"));

  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
                              params.pad_left != params.pad_right)) {
    // Remove the padding that was added to the input shape above.
    functor::PadInput<GPUDevice, T, int, 4>()(
        context->eigen_device<GPUDevice>(),
        To32Bit(const_cast<const Tensor&>(transformed_and_padded_input_backprop)
                    .tensor<T, 4>()),
        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
        {{static_cast<int>(-input_pad_bottom),
          static_cast<int>(-input_pad_right)}},
        To32Bit(transformed_input_backprop.template tensor<T, 4>()),
        transformed_input_data_format, T{});
  }

#if CUDNN_VERSION < 7300
  if (data_format == FORMAT_NHWC) {
    /// Transform the output data from NCHW back to NHWC.
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 4>()(
        context->eigen_device<Device>(),
        toConstTensor(transformed_input_backprop).template tensor<T, 4>(),
        input_backprop->tensor<T, 4>());
  }
#endif  // CUDNN_VERSION < 7300
}

#define DEFINE_DNN_OPS(T)           \
  template class DnnPoolingOp<T>;   \
  template class DnnPoolingGradOp<T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS)

#if CUDNN_VERSION >= 7300
template class DnnPoolingOp<qint8>;
#endif

#undef DEFINE_DNN_OPS

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow