/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/kernels/pooling_ops_common.h"

#include <vector>

#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/kernel_shape_util.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"

#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
#endif  // GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#if TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#endif
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

namespace {

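// RawType maps a TensorFlow element type to the raw representation used when
// handing buffers to StreamExecutor and the layout functors; in particular,
// quantized qint8 data is passed as plain int8.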
template <typename T>
struct RawType {
  using type = T;
};

template <>
struct RawType<qint8> {
  using type = int8;
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

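// Pads the spatial dimensions of the input with -infinity so that max pooling
// over the padded region never selects a padded element. The qint8
// specialization returns an error, since explicit padding is not supported
// for that type.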
template <typename T>
struct PadInputWithNegativeInf {
  Status operator()(const GPUDevice& d,
                    typename TTypes<T, 4, int>::ConstTensor in,
                    int input_pad_top, int input_pad_bottom, int input_pad_left,
                    int input_pad_right, typename TTypes<T, 4, int>::Tensor out,
                    TensorFormat format) {
    T padding_value = -std::numeric_limits<T>::infinity();
    functor::PadInput<GPUDevice, T, int, 4>()(
        d, in, {{input_pad_top, input_pad_left}},
        {{input_pad_bottom, input_pad_right}}, out, format, padding_value);
    return Status::OK();
  }
};

template <>
struct PadInputWithNegativeInf<qint8> {
  Status operator()(const GPUDevice& d,
                    typename TTypes<qint8, 4, int>::ConstTensor in,
                    int input_pad_top, int input_pad_bottom, int input_pad_left,
                    int input_pad_right,
                    typename TTypes<qint8, 4, int>::Tensor out,
                    TensorFormat format) {
    return errors::InvalidArgument(
        "Explicit padding not yet supported with qint8");
  }
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace

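// Checks that each explicit padding value is non-negative and strictly smaller
// than the corresponding pooling window dimension.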
Status CheckPaddingSize(int64_t window_rows, int64_t window_cols,
                        int64_t pad_top, int64_t pad_bottom, int64_t pad_left,
                        int64_t pad_right) {
  if (!FastBoundsCheck(pad_top, window_rows)) {
    return errors::InvalidArgument("Top padding ", pad_top,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_rows);
  }
  if (!FastBoundsCheck(pad_bottom, window_rows)) {
    return errors::InvalidArgument("Bottom padding ", pad_bottom,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_rows);
  }
  if (!FastBoundsCheck(pad_left, window_cols)) {
    return errors::InvalidArgument("Left padding ", pad_left,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_cols);
  }
  if (!FastBoundsCheck(pad_right, window_cols)) {
    return errors::InvalidArgument("Right padding ", pad_right,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_cols);
  }
  return Status::OK();
}

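// Computes the pooling window, stride, padding, and output dimensions from the
// op attributes and the input shape, reporting any invalid combination through
// the OpKernelContext.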
PoolParameters::PoolParameters(OpKernelContext* context,
                               const std::vector<int32>& ksize,
                               const std::vector<int32>& stride,
                               Padding padding,
                               std::vector<int64> explicit_paddings,
                               TensorFormat data_format,
                               const TensorShape& tensor_in_shape) {
  // For maxpooling, tensor_in should have 2 spatial dimensions.
  // Note: the total number of dimensions could be 4 for NHWC, NCHW,
  // or 5 for NCHW_VECT_C.
  OP_REQUIRES(context,
              GetTensorSpatialDims(tensor_in_shape.dims(), data_format) == 2,
              errors::InvalidArgument(
                  "tensor_in_shape must have 2 spatial dimensions. ",
                  tensor_in_shape.dims(), " ", data_format));

  this->data_format = data_format;
  depth = GetTensorDim(tensor_in_shape, data_format, 'C') *
          (data_format == FORMAT_NCHW_VECT_C ? 4 : 1);
  tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W');
  tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H');
  tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
  window_rows = GetTensorDim(ksize, data_format, 'H');
  window_cols = GetTensorDim(ksize, data_format, 'W');
  depth_window = GetTensorDim(ksize, data_format, 'C');
  row_stride = GetTensorDim(stride, data_format, 'H');
  col_stride = GetTensorDim(stride, data_format, 'W');
  depth_stride = GetTensorDim(stride, data_format, 'C');

  // We only support 2D pooling across width/height and depthwise
  // pooling, not a combination.
  OP_REQUIRES(context,
              (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
              errors::Unimplemented(
                  "MaxPooling supports exactly one of pooling across depth "
                  "or pooling across width/height."));
  if (padding == Padding::EXPLICIT) {
    OP_REQUIRES_OK(context, CheckValidPadding(padding, explicit_paddings,
                                              /*num_dims=*/4, data_format));
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &pad_top,
                             &pad_bottom);
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &pad_left,
                             &pad_right);
    OP_REQUIRES_OK(context, CheckPaddingSize(window_rows, window_cols, pad_top,
                                             pad_bottom, pad_left, pad_right));
  }

  if (depth_window == 1) {
    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                tensor_in_rows, window_rows, row_stride,
                                padding, &out_height, &pad_top, &pad_bottom));
    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                tensor_in_cols, window_cols, col_stride,
                                padding, &out_width, &pad_left, &pad_right));
    pad_depth = 0;
    out_depth = depth;
  } else {
    OP_REQUIRES(context, depth_window > 0,
                errors::InvalidArgument("depth_window must not be 0"));
    // Our current version of depthwise max pooling does not support
    // any padding, and expects the depth_window to equal the
    // depth_stride (no overlapping).
    OP_REQUIRES(
        context, depth % depth_window == 0,
        errors::Unimplemented("Depthwise max pooling requires the depth "
                              "window to evenly divide the input depth"));
    OP_REQUIRES(
        context, depth_stride == depth_window,
        errors::Unimplemented("Depthwise max pooling requires the depth "
                              "window to equal the depth stride"));

    // The current version of depthwise max is only implemented on CPU.
    OP_REQUIRES(context,
                (DeviceType(static_cast<Device*>(context->device())
                                ->attributes()
                                .device_type()) == DeviceType(DEVICE_CPU)),
                errors::Unimplemented("Depthwise max pooling is currently "
                                      "only implemented for CPU devices."));

    pad_depth = 0;
    out_depth = depth / depth_window;
  }
}

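// Returns the output shape of the forward pooling op implied by these
// parameters: spatial pooling keeps the input depth, while depthwise pooling
// keeps the spatial dimensions and shrinks the depth.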
TensorShape PoolParameters::forward_output_shape() {
  if (depth_window == 1) {
    // Spatial pooling
    return ShapeFromFormat(data_format, tensor_in_batch, out_height, out_width,
                           depth);
  } else {
    // Depthwise pooling
    return TensorShape(
        {tensor_in_batch, tensor_in_rows, tensor_in_cols, out_depth});
  }
}

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

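// Runs forward pooling on the GPU through StreamExecutor (cuDNN or MIOpen),
// converting layouts and equalizing asymmetric explicit padding as needed
// before launching the DNN pooling kernel.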
template <typename T>
void DnnPoolingOp<T>::Compute(OpKernelContext* context,
                              se::dnn::PoolingMode pooling_mode,
                              const std::vector<int32>& size,
                              const std::vector<int32>& stride, Padding padding,
                              std::vector<int64> explicit_paddings,
                              TensorFormat data_format, const Tensor& tensor_in,
                              const TensorShape& tensor_out_shape,
                              bool propagate_nans) {
  Tensor* tensor_out = nullptr;
  OP_REQUIRES_OK(context,
                 context->allocate_output(0, tensor_out_shape, &tensor_out));
  if (tensor_in.shape().num_elements() == 0) {
    return;
  }

  PoolParameters params{
      context,           size,        stride,           padding,
      explicit_paddings, data_format, tensor_in.shape()};
  if (!context->status().ok()) {
    return;
  }

  int batch_size = params.tensor_in_batch;
  int depth = params.depth;
  int tensor_in_cols = params.tensor_in_cols;
  int tensor_in_rows = params.tensor_in_rows;

#if CUDNN_VERSION < 7300
  /// cuDNN versions before 7.3 do not support the NHWC format, so we need to
  /// convert the input to NCHW before calling cuDNN and convert the result
  /// back afterwards. Remove this once those versions are no longer supported.
  Tensor transformed_input;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_in.shape(),
                                                data_format),
                                &transformed_input));
    functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                           tensor_in.tensor<T, 4>(),
                                           transformed_input.tensor<T, 4>());
  } else {
    transformed_input = tensor_in;
  }
  Tensor transformed_output;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_out_shape,
                                                data_format),
                                &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX;
#else
  Tensor transformed_input = tensor_in;
  auto& transformed_output = *tensor_out;
  se::dnn::DataLayout data_layout;
  switch (data_format) {
    case FORMAT_NHWC:
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
      break;
    case FORMAT_NCHW:
      data_layout = se::dnn::DataLayout::kBatchDepthYX;
      break;
    case FORMAT_NCHW_VECT_C:
      // NCHW_VECT_C is not supported by cudnnPoolingForward(), but can be
      // emulated via NHWC.
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
      batch_size *= depth / 4;
      depth = 4;
      break;
    default:
      OP_REQUIRES(context, false,
                  errors::InvalidArgument("Unsupported format: ",
                                          ToString(data_format)));
  }
#endif

  int64_t vertical_padding = params.pad_top;
  int64_t horizontal_padding = params.pad_left;

  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
                              params.pad_left != params.pad_right)) {
    // cuDNN only supports padding the same amount on the left and right sides,
    // and on the top and bottom sides. So we manually create a new padded
    // input tensor such that we can pass it to cuDNN.
    const int64_t common_padding_rows =
        std::min(params.pad_top, params.pad_bottom);
    const int64_t common_padding_cols =
        std::min(params.pad_left, params.pad_right);

    Tensor padded_input;
    const int64_t padding_rows_diff =
        std::abs(params.pad_top - params.pad_bottom);
    const int64_t padding_cols_diff =
        std::abs(params.pad_left - params.pad_right);

    const int64_t new_in_rows = tensor_in_rows + padding_rows_diff;
    const int64_t new_in_cols = tensor_in_cols + padding_cols_diff;

    OP_REQUIRES_OK(
        context,
        context->allocate_temp(DataTypeToEnum<T>::value,
                               ShapeFromFormat(data_format, batch_size,
                                               new_in_rows, new_in_cols, depth),
                               &padded_input));
    const int64_t input_pad_top = params.pad_top - common_padding_rows;
    const int64_t input_pad_bottom = params.pad_bottom - common_padding_rows;
    const int64_t input_pad_left = params.pad_left - common_padding_cols;
    const int64_t input_pad_right = params.pad_right - common_padding_cols;

    bool in_bounds =
        FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
    if (!in_bounds) {
      context->SetStatus(errors::InvalidArgument("Padding is too large."));
      return;
    }

    // We need to call the const version of transformed_input.tensor()
    const Tensor& const_transformed_input = transformed_input;
    OP_REQUIRES_OK(
        context,
        PadInputWithNegativeInf<T>()(
            context->eigen_device<GPUDevice>(),
            To32Bit(const_transformed_input.tensor<T, 4>()),
            static_cast<int>(input_pad_top), static_cast<int>(input_pad_bottom),
            static_cast<int>(input_pad_left), static_cast<int>(input_pad_right),
            To32Bit(padded_input.tensor<T, 4>()), data_format));
    transformed_input = padded_input;
    vertical_padding = common_padding_rows;
    horizontal_padding = common_padding_cols;
    tensor_in_rows = new_in_rows;
    tensor_in_cols = new_in_cols;
  }

  se::dnn::PoolingDescriptor pooling_desc;
  pooling_desc.set_pooling_mode(pooling_mode)
      .set_window_height(params.window_rows)
      .set_window_width(params.window_cols)
      .set_vertical_stride(params.row_stride)
      .set_horizontal_stride(params.col_stride)
      .set_vertical_padding(vertical_padding)
      .set_horizontal_padding(horizontal_padding)
      .set_propagate_nans(propagate_nans);

  se::dnn::BatchDescriptor input_desc;
  input_desc.set_count(batch_size)
      .set_height(tensor_in_rows)
      .set_width(tensor_in_cols)
      .set_feature_map_count(depth)
      .set_layout(data_layout);

  se::dnn::BatchDescriptor output_desc;
  output_desc.set_count(batch_size)
      .set_height(params.out_height)
      .set_width(params.out_width)
      .set_feature_map_count(depth)
      .set_layout(data_layout);

  auto input_data =
      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
                         transformed_input.template flat<T>().data()),
                     transformed_input.template flat<T>().size());

  auto output_data =
      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
                         transformed_output.template flat<T>().data()),
                     transformed_output.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

#if TENSORFLOW_USE_ROCM
  static int64 PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );

  DnnScratchAllocator scratch_allocator(PoolingScratchSize, context);
  bool status =
      stream
          ->ThenPoolForward(pooling_desc, input_desc, input_data, output_desc,
                            &output_data, &scratch_allocator)
          .ok();
#else
  bool status = stream
                    ->ThenPoolForward(pooling_desc, input_desc, input_data,
                                      output_desc, &output_data)
                    .ok();
#endif
  OP_REQUIRES(context, status,
              errors::Internal("dnn PoolForward launch failed"));
#if CUDNN_VERSION < 7300
  if (data_format == FORMAT_NHWC) {
    /// Transform the output data from NCHW back to NHWC
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    using RT = typename RawType<T>::type;
    functor::NCHWToNHWC<GPUDevice, RT, 4>()(
        context->eigen_device<Device>(),
        toConstTensor(transformed_output).template tensor<RT, 4>(),
        tensor_out->tensor<RT, 4>());
  }
#endif
}

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                             \
  template <>                                                           \
  void PadInput<GPUDevice, T, int, 4>::operator()(                      \
      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,   \
      const std::array<int, 2>& padding_left,                           \
      const std::array<int, 2>& padding_right,                          \
      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
      const T& padding_value);                                          \
  extern template struct PadInput<GPUDevice, T, int, 4>;

DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(double);
DECLARE_GPU_SPEC(int32);
}  // namespace functor

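// Runs the pooling gradient on the GPU through StreamExecutor (cuDNN or
// MIOpen). For max pooling the original input and output tensors must be
// provided; for average pooling they may be null, in which case placeholder
// tensors are allocated below.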
template <typename T>
void DnnPoolingGradOp<T>::Compute(
    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
    const std::vector<int32>& size, const std::vector<int32>& stride,
    Padding padding, std::vector<int64> explicit_paddings,
    TensorFormat data_format, const Tensor* tensor_in, const Tensor* tensor_out,
    const Tensor& out_backprop, const TensorShape& tensor_in_shape,
    bool propagate_nans) {
  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
        (tensor_in && tensor_out))
      << "For MaxPoolGrad, both tensor_in and tensor_out need to be "
         "specified";

  Tensor* input_backprop = nullptr;
  OP_REQUIRES_OK(context,
                 context->allocate_output(0, tensor_in_shape, &input_backprop));
  if (tensor_in_shape.num_elements() == 0) {
    return;
  }

  PoolParameters params{context,           size,        stride,         padding,
                        explicit_paddings, data_format, tensor_in_shape};
  if (!context->status().ok()) {
    return;
  }

  TensorFormat transformed_input_data_format = data_format;

#if CUDNN_VERSION < 7300
  /// cuDNN versions before 7.3 do not support the NHWC format, so we need to
  /// convert the tensors to NCHW before calling cuDNN and convert the result
  /// back afterwards. Remove this once those versions are no longer supported.
  Tensor transformed_input;
  TensorShape transformed_input_shape;
  if (data_format == FORMAT_NHWC || !tensor_in) {
    transformed_input_shape =
        ShapeFromFormat(FORMAT_NCHW, tensor_in_shape, data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_input_shape,
                                                   &transformed_input));
  } else {
    transformed_input = *tensor_in;
  }
  Tensor transformed_output;
  TensorShape transformed_output_shape;
  if (data_format == FORMAT_NHWC || !tensor_out) {
    transformed_output_shape =
        ShapeFromFormat(FORMAT_NCHW, out_backprop.shape(), data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_output_shape,
                                                   &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  Tensor transformed_input_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_input_shape,
                                          &transformed_input_backprop));
  } else {
    transformed_input_backprop = *input_backprop;
  }
  Tensor transformed_output_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_output_shape,
                                          &transformed_output_backprop));
  } else {
    transformed_output_backprop = out_backprop;
  }

  if (data_format == FORMAT_NHWC) {
    /// Convert the data from NHWC to NCHW if necessary.
    if (tensor_in) {
      // For AvgPoolGrad, the original input tensor is not necessary. However,
      // cudnn still requires it to run, although it does not affect the
      // results.
      functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                             tensor_in->tensor<T, 4>(),
                                             transformed_input.tensor<T, 4>());
      transformed_input_data_format = FORMAT_NCHW;
    }
    if (tensor_out) {
      // For AvgPoolGrad, the original output tensor is not necessary. However,
      // cudnn still requires it to run, although it does not affect the
      // results.
      functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                             tensor_out->tensor<T, 4>(),
                                             transformed_output.tensor<T, 4>());
    }
    functor::NHWCToNCHW<GPUDevice, T, 4>()(
        context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
        transformed_output_backprop.tensor<T, 4>());
  }
  se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX;
#else
  Tensor transformed_input;
  if (!tensor_in) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          tensor_in_shape, &transformed_input));
  } else {
    transformed_input = *tensor_in;
  }
  Tensor transformed_output;
  if (!tensor_out) {
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   out_backprop.shape(),
                                                   &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  Tensor transformed_input_backprop = *input_backprop;
  Tensor transformed_output_backprop = out_backprop;
  se::dnn::DataLayout data_layout;
  switch (data_format) {
    case FORMAT_NHWC:
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
      break;
    case FORMAT_NCHW:
      data_layout = se::dnn::DataLayout::kBatchDepthYX;
      break;
    default:
      OP_REQUIRES(context, false,
                  errors::InvalidArgument("Unsupported format: ",
                                          ToString(data_format)));
  }
#endif  // CUDNN_VERSION < 7300

  int64_t vertical_padding = params.pad_top;
  int64_t horizontal_padding = params.pad_left;

  int batch_size = params.tensor_in_batch;
  int depth = params.depth;
  int tensor_in_cols = params.tensor_in_cols;
  int tensor_in_rows = params.tensor_in_rows;

  int64_t input_pad_top = 0;
  int64_t input_pad_bottom = 0;
  int64_t input_pad_left = 0;
  int64_t input_pad_right = 0;

  Tensor transformed_and_padded_input_backprop;

  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
                              params.pad_left != params.pad_right)) {
    // Pad the input in the same way we did during the forward pass, so that
    // cuDNN or MIOpen receives the same input during the backward pass function
    // as it did during the forward pass function.
    const int64_t common_padding_rows =
        std::min(params.pad_top, params.pad_bottom);
    const int64_t common_padding_cols =
        std::min(params.pad_left, params.pad_right);

    Tensor padded_input;
    const int64_t padding_rows_diff =
        std::abs(params.pad_top - params.pad_bottom);
    const int64_t padding_cols_diff =
        std::abs(params.pad_left - params.pad_right);

    const int64_t new_in_rows = tensor_in_rows + padding_rows_diff;
    const int64_t new_in_cols = tensor_in_cols + padding_cols_diff;

    VLOG(2) << "Create new tensor: "
            << " original rows=" << tensor_in_rows
            << " original cols=" << tensor_in_cols
            << " padding_rows=" << new_in_rows
            << " padding_cols=" << new_in_cols << " depth=" << depth
            << " batch_size=" << batch_size << " kernel_rows="
            << params.window_rows << " kernel_cols=" << params.window_cols
            << " stride_rows=" << params.row_stride;

    OP_REQUIRES_OK(
        context, context->allocate_temp(
                     DataTypeToEnum<T>::value,
                     ShapeFromFormat(transformed_input_data_format, batch_size,
                                     new_in_rows, new_in_cols, depth),
                     &padded_input));

    OP_REQUIRES_OK(
        context, context->allocate_temp(
                     DataTypeToEnum<T>::value,
                     ShapeFromFormat(transformed_input_data_format, batch_size,
                                     new_in_rows, new_in_cols, depth),
                     &transformed_and_padded_input_backprop));

    input_pad_top = params.pad_top - common_padding_rows;
    input_pad_bottom = params.pad_bottom - common_padding_rows;
    input_pad_left = params.pad_left - common_padding_cols;
    input_pad_right = params.pad_right - common_padding_cols;

    bool in_bounds =
        FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
    if (!in_bounds) {
      context->SetStatus(errors::InvalidArgument("Padding is too large."));
      return;
    }

    // PadInputWithNegativeInf functor requires input to be a const.
    const Tensor& const_transformed_input = transformed_input;
    OP_REQUIRES_OK(
        context,
        PadInputWithNegativeInf<T>()(
            context->eigen_device<GPUDevice>(),
            To32Bit(const_transformed_input.tensor<T, 4>()),
            static_cast<int>(input_pad_top), static_cast<int>(input_pad_bottom),
            static_cast<int>(input_pad_left), static_cast<int>(input_pad_right),
            To32Bit(padded_input.tensor<T, 4>()),
            transformed_input_data_format));

    transformed_input = padded_input;

    vertical_padding = common_padding_rows;
    horizontal_padding = common_padding_cols;
    VLOG(2) << "vertical padding set to: " << vertical_padding
            << " horizontal padding set to: " << horizontal_padding;
    tensor_in_rows = new_in_rows;
    tensor_in_cols = new_in_cols;
  } else {
    transformed_and_padded_input_backprop = transformed_input_backprop;
  }

  /// Get ready to call cudnn
  se::dnn::PoolingDescriptor pooling_desc;
  pooling_desc.set_pooling_mode(pooling_mode)
      .set_window_height(params.window_rows)
      .set_window_width(params.window_cols)
      .set_vertical_stride(params.row_stride)
      .set_horizontal_stride(params.col_stride)
      .set_vertical_padding(vertical_padding)
      .set_horizontal_padding(horizontal_padding)
      .set_propagate_nans(propagate_nans);

  se::dnn::BatchDescriptor orig_output_desc;
  orig_output_desc.set_count(params.tensor_in_batch)
      .set_height(params.out_height)
      .set_width(params.out_width)
      .set_feature_map_count(params.depth)
      .set_layout(data_layout);

  se::dnn::BatchDescriptor orig_input_desc;
  orig_input_desc.set_count(params.tensor_in_batch)
      .set_height(tensor_in_rows)
      .set_width(tensor_in_cols)
      .set_feature_map_count(params.depth)
      .set_layout(data_layout);

  auto orig_output_data =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());
  auto orig_input_data =
      AsDeviceMemory(transformed_input.template flat<T>().data(),
                     transformed_input.template flat<T>().size());
  auto output_backprop_data =
      AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
                     transformed_output_backprop.template flat<T>().size());
  auto input_backprop_data = AsDeviceMemory(
      transformed_and_padded_input_backprop.template flat<T>().data(),
      transformed_and_padded_input_backprop.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

#if TENSORFLOW_USE_ROCM
  static int64 PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );

  DnnScratchAllocator scratch_allocator(PoolingScratchSize, context);
  bool status = stream
                    ->ThenPoolBackward(pooling_desc, orig_input_desc,
                                       orig_input_data, orig_output_desc,
                                       orig_output_data, output_backprop_data,
                                       &input_backprop_data, &scratch_allocator)
                    .ok();
#else
  bool status =
      stream
          ->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data,
                             orig_output_desc, orig_output_data,
                             output_backprop_data, &input_backprop_data)
          .ok();
#endif

  OP_REQUIRES(context, status,
              errors::Internal("dnn PoolBackward launch failed"));

  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
                              params.pad_left != params.pad_right)) {
    // Remove the padding that was added to the input shape above.
    functor::PadInput<GPUDevice, T, int, 4>()(
        context->eigen_device<GPUDevice>(),
        To32Bit(const_cast<const Tensor&>(transformed_and_padded_input_backprop)
                    .tensor<T, 4>()),
        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
        {{static_cast<int>(-input_pad_bottom),
          static_cast<int>(-input_pad_right)}},
        To32Bit(transformed_input_backprop.template tensor<T, 4>()),
        transformed_input_data_format, T{});
  }

#if CUDNN_VERSION < 7300
  if (data_format == FORMAT_NHWC) {
    /// Transform the output data from NCHW back to NHWC.
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 4>()(
        context->eigen_device<Device>(),
        toConstTensor(transformed_input_backprop).template tensor<T, 4>(),
        input_backprop->tensor<T, 4>());
  }
#endif  // CUDNN_VERSION < 7300
}

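// Instantiate the forward and gradient ops for the GPU number types. The
// forward op is additionally instantiated for qint8 when building against
// cuDNN 7.3 or newer.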
#define DEFINE_DNN_OPS(T)         \
  template class DnnPoolingOp<T>; \
  template class DnnPoolingGradOp<T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS)

#if CUDNN_VERSION >= 7300
template class DnnPoolingOp<qint8>;
#endif

#undef DEFINE_DNN_OPS

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow