/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15 
16 // See docs in ../ops/nn_ops.cc.
17 
18 #define EIGEN_USE_THREADS
19 
20 #include "tensorflow/core/kernels/maxpooling_op.h"
21 
22 #include <type_traits>
23 #include <vector>
24 
25 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
26 #include "tensorflow/core/common_runtime/device.h"
27 #include "tensorflow/core/framework/bounds_check.h"
28 #include "tensorflow/core/framework/numeric_op.h"
29 #include "tensorflow/core/framework/op_kernel.h"
30 #include "tensorflow/core/framework/register_types.h"
31 #include "tensorflow/core/framework/tensor.h"
32 #include "tensorflow/core/framework/tensor_shape.h"
33 #include "tensorflow/core/framework/tensor_slice.h"
34 #include "tensorflow/core/kernels/conv_2d.h"
35 #include "tensorflow/core/kernels/eigen_pooling.h"
36 #include "tensorflow/core/kernels/ops_util.h"
37 #include "tensorflow/core/kernels/pooling_ops_common.h"
38 #include "tensorflow/core/lib/core/errors.h"
39 #include "tensorflow/core/lib/gtl/array_slice.h"
40 #include "tensorflow/core/util/determinism.h"
41 #include "tensorflow/core/util/env_var.h"
42 #include "tensorflow/core/util/padding.h"
43 #include "tensorflow/core/util/tensor_format.h"
44 #include "tensorflow/core/util/use_cudnn.h"
45 
46 #if GOOGLE_CUDA
47 #include "third_party/gpus/cudnn/cudnn.h"
48 #endif  // GOOGLE_CUDA
49 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
50 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
51 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
52 #include "tensorflow/core/platform/stream_executor.h"
53 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
54 
55 namespace tensorflow {
56 
57 typedef Eigen::ThreadPoolDevice CPUDevice;
58 typedef Eigen::GpuDevice GPUDevice;
59 
60 const int kInvalidMaxPoolingIndex = -1;
61 
62 template <typename Device, typename T, typename Targmax>
SpatialMaxPoolWithArgMaxHelper(OpKernelContext * context,Tensor * output,Tensor * output_arg_max,Tensor * input_backprop,const Tensor & tensor_in,const Tensor & out_backprop,const PoolParameters & params,const bool include_batch_in_index)63 static void SpatialMaxPoolWithArgMaxHelper(
64     OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
65     Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop,
66     const PoolParameters& params, const bool include_batch_in_index) {
67   if (input_backprop != nullptr) {
68     OP_REQUIRES(
69         context, include_batch_in_index,
70         errors::Internal(
71             "SpatialMaxPoolWithArgMaxHelper requires include_batch_in_index "
72             "to be True when input_backprop != nullptr"));
73     OP_REQUIRES(
74         context, (std::is_same<Targmax, int64>::value),
75         errors::Internal("SpatialMaxPoolWithArgMaxHelper requires Targmax "
76                          "to be int64 when input_backprop != nullptr"));
77   }
78   if (tensor_in.NumElements() == 0 || output->NumElements() == 0) return;
79 
80   typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
81       ConstEigenMatrixMap;
82   typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
83       EigenMatrixMap;
84   typedef Eigen::Map<Eigen::Matrix<Targmax, Eigen::Dynamic, Eigen::Dynamic>>
85       EigenIndexMatrixMap;
86 
87   ConstEigenMatrixMap in_mat(
88       tensor_in.flat<T>().data(), params.depth,
89       params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
90   EigenMatrixMap out_mat(
91       output->flat<T>().data(), params.depth,
92       params.out_width * params.out_height * params.tensor_in_batch);
93   EigenIndexMatrixMap out_arg_max_mat(
94       output_arg_max->flat<Targmax>().data(), params.depth,
95       params.out_width * params.out_height * params.tensor_in_batch);
96 
97   const DeviceBase::CpuWorkerThreads& worker_threads =
98       *(context->device()->tensorflow_cpu_worker_threads());
99 
100   // The following code basically does the following:
101   // 1. Flattens the input and output tensors into two dimensional arrays.
102   //    tensor_in_as_matrix:
103   //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
104   //    output_as_matrix:
105   //      depth by (out_width * out_height * tensor_in_batch)
106   //
107   // 2. Walks through the set of columns in the flattened tensor_in_as_matrix,
108   //    and updates the corresponding column(s) in output_as_matrix with the
109   //    max value.
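  //
  // For example, an NHWC input of shape [batch=2, rows=4, cols=4, depth=3]
  // yields a 3 x (4 * 4 * 2) in_mat whose column (b * 4 + h) * 4 + w holds
  // the depth vector at spatial position (h, w) of batch b; out_mat and
  // out_arg_max_mat are laid out the same way over the pooled grid.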
  auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
                &output_arg_max, &out_backprop,
                include_batch_in_index](int64_t start, int64_t limit) {
    const int32_t depth = params.depth;
    const int32_t in_rows = params.tensor_in_rows;
    const int32_t in_cols = params.tensor_in_cols;
    const int32_t pad_top = params.pad_top;
    const int32_t pad_left = params.pad_left;
    const int32_t window_rows = params.window_rows;
    const int32_t window_cols = params.window_cols;
    const int32_t row_stride = params.row_stride;
    const int32_t col_stride = params.col_stride;
    const int32_t out_height = params.out_height;
    const int32_t out_width = params.out_width;

    {
      // Initializes the output tensor with MIN<T>.
      const int32_t output_image_size = out_height * out_width * depth;
      EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, 1,
                               (limit - start) * output_image_size);
      out_shard.setConstant(Eigen::NumTraits<T>::lowest());
      EigenIndexMatrixMap out_arg_max_shard(
          out_arg_max_mat.data() + start * output_image_size, 1,
          (limit - start) * output_image_size);
      out_arg_max_shard.setConstant(kInvalidMaxPoolingIndex);
    }

    for (int64_t b = start; b < limit; ++b) {
      for (int h = 0; h < in_rows; ++h) {
        for (int w = 0; w < in_cols; ++w) {
          // (h_start, h_end) * (w_start, w_end) is the range that the input
          // vector projects to.
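          // Derivation: output row ph pools input rows
          // [ph * row_stride - pad_top, ph * row_stride - pad_top +
          // window_rows), so the padded row hpad = h + pad_top is covered by
          // exactly those ph with
          // (hpad - window_rows) / row_stride < ph <= hpad / row_stride,
          // clamped to [0, out_height); columns are handled symmetrically.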
          const int hpad = h + pad_top;
          const int wpad = w + pad_left;
          const int h_start =
              (hpad < window_rows) ? 0 : (hpad - window_rows) / row_stride + 1;
          const int h_end = std::min(hpad / row_stride + 1, out_height);
          const int w_start =
              (wpad < window_cols) ? 0 : (wpad - window_cols) / col_stride + 1;
          const int w_end = std::min(wpad / col_stride + 1, out_width);
          // compute elementwise max
          const int64_t in_index = (b * in_rows + h) * in_cols + w;
          for (int ph = h_start; ph < h_end; ++ph) {
            const int64_t out_index_base = (b * out_height + ph) * out_width;
            for (int pw = w_start; pw < w_end; ++pw) {
              const int64_t out_index = out_index_base + pw;
              /// NOTES(zhengxq): not using the eigen matrix operation for
              /// now.
              for (int d = 0; d < depth; ++d) {
                const T& input_ref = in_mat.coeffRef(d, in_index);
                T& output_ref = out_mat.coeffRef(d, out_index);
                Targmax& out_arg_max_ref =
                    out_arg_max_mat.coeffRef(d, out_index);
                if (output_ref < input_ref ||
                    out_arg_max_ref == kInvalidMaxPoolingIndex) {
                  output_ref = input_ref;
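                  // The stored argmax is the flattened NHWC index of the
                  // winning input element:
                  // ((b * in_rows + h) * in_cols + w) * depth + d with the
                  // batch included, or (h * in_cols + w) * depth + d
                  // without it.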
                  if (include_batch_in_index) {
                    out_arg_max_ref = in_index * depth + d;
                  } else {
                    out_arg_max_ref = (h * in_cols + w) * depth + d;
                  }
                }
              }
            }
          }
        }
      }
    }

    if (input_backprop != nullptr) {
      auto input_backprop_flat = input_backprop->flat<T>();
      auto out_arg_max_flat = output_arg_max->flat<int64>();
      auto out_backprop_flat = out_backprop.flat<T>();

      // Initialize output to 0.
      const int64_t in_size = in_rows * in_cols * depth;
      const int64_t in_start = start * in_size;
      const int64_t in_end = limit * in_size;
      EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1,
                              in_end - in_start);
      in_shard.setConstant(T(0));

      // Backpropagate.
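      // Each element of out_backprop is scatter-added into the input
      // position recorded for it in the argmax tensor, so an input cell
      // that won several pooling windows accumulates all of their
      // gradients.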
      const int out_size = out_height * out_width * depth;
      const int out_start = start * out_size;
      const int out_end = limit * out_size;
      for (int index = out_start; index < out_end; ++index) {
        int input_backprop_index = out_arg_max_flat(index);
        // Although this check is in the inner loop, it is worth its cost:
        // it prevents memory corruption, and our benchmarks show the
        // performance impact is quite small.
        // CHECK(input_backprop_index >= in_start && input_backprop_index <
        // in_end)
        FastBoundsCheck(input_backprop_index - in_start, in_end - in_start);
        if (index < out_backprop.NumElements()) {
          input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
        }
      }
    }
  };

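  // Rough per-batch-element cost passed to Shard so the batch is split
  // across worker threads in proportion to the work per image: one
  // compare/update per (input position x window position x channel).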
  const int64_t shard_cost = params.tensor_in_rows * params.tensor_in_cols *
                             params.depth * params.window_rows *
                             params.window_cols;
  Shard(worker_threads.num_threads, worker_threads.workers,
        params.tensor_in_batch, shard_cost, shard);
}

// The operation to compute MaxPool gradients.
// It takes three inputs:
//   - The original input tensor
//   - The original output tensor
//   - Backprop tensor for output
// It produces one output: backprop tensor for input.
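// On CPU this is implemented by re-running the forward pass via
// SpatialMaxPoolWithArgMaxHelper to recover the argmax of every pooled
// window, then scattering out_backprop through those indices.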
template <class Device, class T>
class MaxPoolingGradOp : public OpKernel {
 public:
  explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ",
                                "on device type ",
                                DeviceTypeString(context->device_type())));

    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
      OP_REQUIRES(
          context, ksize_[3] == 1 && stride_[3] == 1,
          errors::Unimplemented(
              "MaxPoolingGrad is not yet supported on the depth dimension."));
    }

    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));

    if (padding_ == Padding::EXPLICIT) {
      OP_REQUIRES_OK(
          context, context->GetAttr("explicit_paddings", &explicit_paddings_));
      OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                                /*num_dims=*/4, data_format_));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));

    const TensorShape& output_shape = tensor_in.shape();

    Tensor tensor_out_dup;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_temp(
                                {1}, DataTypeToEnum<T>::v(), tensor_out.shape(),
                                &tensor_out_dup));
    Tensor tensor_out_arg_max;
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
                                                   tensor_out.shape(),
                                                   &tensor_out_arg_max));
    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
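    // The V2 variant of the op supplies ksize and strides as tensor inputs
    // (3 and 4) instead of attrs; prefer those values when present.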
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(
        context, ksize[3] == 1 && stride[3] == 1,
        errors::Unimplemented(
            "MaxPoolingGrad is not yet supported on the depth dimension."));

    PoolParameters params{context,
                          ksize,
                          stride,
                          padding_,
                          explicit_paddings_,
                          FORMAT_NHWC,
                          tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {0}, 0, output_shape, &output));

    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T, int64>(
        context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
        out_backprop, params, true);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  std::vector<int64> explicit_paddings_;
  TensorFormat data_format_;
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <class T>
class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
 public:
  typedef Eigen::GpuDevice Device;

  explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    if (padding_ == Padding::EXPLICIT) {
      OP_REQUIRES_OK(
          context, context->GetAttr("explicit_paddings", &explicit_paddings_));
      OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                                /*num_dims=*/4, data_format_));
    }
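    // When TF_ENABLE_MAXPOOL_NANPROP is set, NaN values are propagated
    // through the pooling gradient instead of being suppressed.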
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));

    TensorShape output_shape = tensor_in.shape();

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    const int32_t ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32_t stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    int64_t pad_top, pad_bottom, pad_left, pad_right;
    if (padding_ == Padding::EXPLICIT) {
      GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'H',
                               /*pad_top=*/&pad_top,
                               /*pad_bottom=*/&pad_bottom);
      GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'W',
                               /*pad_left=*/&pad_left,
                               /*pad_right=*/&pad_right);
    }
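    // Delegate the gradient computation to the cuDNN/MIOpen pooling
    // backward pass via the shared DnnPoolingGradOp helper.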
    DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize,
                                 stride, padding_, explicit_paddings_,
                                 data_format_, &tensor_in, &tensor_out,
                                 out_backprop, output_shape, propagate_nans_);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  std::vector<int64> explicit_paddings_;
  TensorFormat data_format_;
  bool propagate_nans_;
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// The operation to compute the gradient of MaxPool gradients.
// It takes three inputs:
//   - The original input tensor
//   - The original output tensor
//   - Backprop tensor for output gradients
// It produces one output: backprop tensor for the output gradient.
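// Conceptually, the second-order gradient reuses the argmax of the
// original forward pass: each output element of this op forwards the
// gradient perturbation of the input element that won its pooling window.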
template <class Device, class T>
class MaxPoolingGradGradOp : public OpKernel {
 public:
  explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingGradGradOp only supports NHWC ",
            "on device type ", DeviceTypeString(context->device_type())));

    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));

    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
      OP_REQUIRES(context, ksize_[3] == 1 && stride_[3] == 1,
                  errors::Unimplemented("MaxPoolingGradGrad is not yet "
                                        "supported on the depth dimension."));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_grad_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_grad_backprop should have 4 dimensions.
    OP_REQUIRES(
        context, out_grad_backprop.dims() == 4,
        errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(
        context, ksize[3] == 1 && stride[3] == 1,
        errors::Unimplemented(
            "MaxPoolingGradGrad is not yet supported on the depth dimension."));

    PoolParameters params{context,
                          ksize,
                          stride,
                          padding_,
                          /*explicit_paddings=*/{},
                          FORMAT_NHWC,
                          tensor_in.shape()};
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {2}, 0, tensor_out.shape(), &output));

    SpatialMaxPoolGradGrad(context, output, tensor_in, tensor_out,
                           out_grad_backprop, params, padding_);
  }

 private:
  void SpatialMaxPoolGradGrad(OpKernelContext* context, Tensor* bottom_diff,
                              const Tensor& tensor_in, const Tensor& tensor_out,
                              const Tensor& top_diff,
                              const PoolParameters& params,
                              const Padding& padding) {
    typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        ConstEigenMatrixMap;
    typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        EigenMatrixMap;

    ConstEigenMatrixMap in_mat(
        tensor_in.flat<T>().data(), params.depth,
        params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
    ConstEigenMatrixMap out_mat(
        tensor_out.flat<T>().data(), params.depth,
        params.out_width * params.out_height * params.tensor_in_batch);
    ConstEigenMatrixMap top_diff_mat(
        top_diff.flat<T>().data(), params.depth,
        params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
    EigenMatrixMap bottom_diff_mat(
        bottom_diff->flat<T>().data(), params.depth,
        params.out_width * params.out_height * params.tensor_in_batch);

    const DeviceBase::CpuWorkerThreads& worker_threads =
        *(context->device()->tensorflow_cpu_worker_threads());

    // The code below does the following:
    // 1. Flattens the input, output, top_diff and bottom_diff tensors into
    //    two dimensional arrays.
    //    tensor_in_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    tensor_out_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //    top_diff_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    bottom_diff_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //
    // 2. Walks through the set of columns in the flattened
    //    tensor_in_as_matrix, tensor_out_as_matrix, top_diff_as_matrix
    //    and updates the column(s) corresponding to the maximum values in
    //    tensor_out_as_matrix with the corresponding values in
    //    top_diff_as_matrix.
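    //
    // For example, if output element (b, ph, pw, d) took its maximum from
    // input position (b, h, w, d) in the forward pass, then
    // bottom_diff(b, ph, pw, d) is set to top_diff(b, h, w, d) and all
    // other top_diff entries in that window are ignored.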
    auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
                     int64_t start, int64_t limit) {
      const int32_t depth = params.depth;
      const int32_t in_rows = params.tensor_in_rows;
      const int32_t in_cols = params.tensor_in_cols;
      const int32_t pad_top = params.pad_top;
      const int32_t pad_left = params.pad_left;
      const int32_t window_rows = params.window_rows;
      const int32_t window_cols = params.window_cols;
      const int32_t row_stride = params.row_stride;
      const int32_t col_stride = params.col_stride;
      const int32_t out_height = params.out_height;
      const int32_t out_width = params.out_width;

      {
        // Initializes the output grad backprop tensor with 0.
        const int32_t output_image_size = out_height * out_width * params.depth;
        EigenMatrixMap bottom_diff_shard(
            bottom_diff_mat.data() + start * output_image_size, 1,
            (limit - start) * output_image_size);
        bottom_diff_shard.setZero();
      }

      for (int b = start; b < limit; ++b) {
        for (int ph = 0; ph < out_height; ++ph) {
          for (int pw = 0; pw < out_width; ++pw) {
            // (h_start, h_end) * (w_start, w_end) is the range of input
            // positions that output element (ph, pw) pools over.
            int h_start = ph * row_stride - pad_top;
            const int h_end = std::min(h_start + window_rows, in_rows);
            int w_start = pw * col_stride - pad_left;
            const int w_end = std::min(w_start + window_cols, in_cols);
            h_start = std::max(h_start, 0);
            w_start = std::max(w_start, 0);
            const int out_index = (b * out_height + ph) * out_width + pw;
            // Find value corresponding to the input maximum in top_diff.
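            // Ties are broken by taking the first input element that equals
            // the pooled maximum, scanning rows then columns, which matches
            // the first-wins update order of the CPU forward pass.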
            for (int d = 0; d < depth; ++d) {
              const T& output_ref = out_mat.coeffRef(d, out_index);
              bool should_stop = false;
              for (int h = h_start; h < h_end && !should_stop; ++h) {
                for (int w = w_start; w < w_end && !should_stop; ++w) {
                  const int in_index = (b * in_rows + h) * in_cols + w;
                  const T& input_ref = in_mat.coeffRef(d, in_index);
                  if (output_ref == input_ref) {
                    T& bottom_diff_ref = bottom_diff_mat.coeffRef(d, out_index);
                    bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
                    should_stop = true;
                  }
                }
              }
            }
          }
        }
      }
    };

    const int64_t shard_cost = params.out_width * params.out_height *
                               params.depth * params.window_rows *
                               params.window_cols;
    Shard(worker_threads.num_threads, worker_threads.workers,
          params.tensor_in_batch, shard_cost, shard);
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <class T>
class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
 public:
  typedef Eigen::GpuDevice Device;

  explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_grad_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_grad_backprop should have 4 dimensions.
    OP_REQUIRES(
        context, out_grad_backprop.dims() == 4,
        errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, tensor_out.shape(), &output));

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    const int32_t ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32_t stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    PoolParameters params{context,
                          ksize,
                          stride,
                          padding_,
                          /*explicit_paddings=*/{},
                          data_format_,
                          tensor_in.shape()};

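    // MaxPoolGradBackwardNoMask recomputes the max position of each pooling
    // window on the GPU instead of consuming a precomputed argmax mask, and
    // gathers the matching out_grad_backprop values into the output.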
    functor::MaxPoolGradBackwardNoMask<T>()(
        data_format_, tensor_in.flat<T>().data(), tensor_out.flat<T>().data(),
        params.tensor_in_batch, params.out_height, params.out_width,
        params.depth, params.tensor_in_rows, params.tensor_in_cols,
        params.window_rows, params.window_cols, params.row_stride,
        params.col_stride, params.pad_top, params.pad_left,
        out_grad_backprop.flat<T>().data(), output->flat<T>().data(),
        context->eigen_device<Eigen::GpuDevice>());
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool use_dnn_;
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Device, typename T>
struct LaunchMaxPoolingNoMask;

template <typename Device, typename T>
class MaxPoolingNoMaskOp : public OpKernel {
 public:
  explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingNoMaskOp only supports NHWC on device type ",
            DeviceTypeString(context->device_type())));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(
        context, padding_ != EXPLICIT,
        errors::Unimplemented(
            "Explicit padding is not supported for MaxPoolingNoMaskOp."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    PoolParameters params{context,
                          ksize_,
                          stride_,
                          padding_,
                          /*explicit_paddings=*/{},
                          data_format_,
                          tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                              output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename Device, typename T>
class MaxPoolingNoMaskV2Op : public OpKernel {
 public:
  explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingNoMaskV2Op only supports NHWC on device type ",
            DeviceTypeString(context->device_type())));
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    PoolParameters params{context,
                          ksize,
                          stride,
                          padding_,
                          /*explicit_paddings=*/{},
                          data_format_,
                          tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                              output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename Device, typename T, typename Targmax>
struct LaunchMaxPoolingWithArgmax;

template <typename T, typename Targmax>
struct LaunchMaxPoolingWithArgmax<CPUDevice, T, Targmax> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output, Tensor* argmax,
                     bool propagate_nans, bool include_batch_in_index) {
    Tensor unused;
    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T, Targmax>(
        context, output, argmax, /*input_backprop=*/nullptr, input, unused,
        params, include_batch_in_index);
  }
};

template <typename Device, typename T, typename Targmax>
class MaxPoolingWithArgmaxOp : public OpKernel {
 public:
  explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
                                             &include_batch_in_index_));
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_in.NumElements() > 0,
                errors::InvalidArgument("tensor_in must not be empty"));

    PoolParameters params{context,
                          ksize_,
                          stride_,
                          padding_,
                          /*explicit_paddings=*/{},
                          FORMAT_NHWC,
                          tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
    Tensor* argmax = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));

    LaunchMaxPoolingWithArgmax<Device, T, Targmax>::launch(
        context, params, tensor_in, output, argmax, propagate_nans_,
        include_batch_in_index_);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  bool propagate_nans_;
  bool include_batch_in_index_;
};

template <typename Device, typename T>
struct LaunchMaxPoolingGradWithArgmax;

template <typename T>
struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> {
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      EigenMatrixMap;

  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out, const bool include_batch_in_index) {
    const DeviceBase::CpuWorkerThreads& worker_threads =
        *(context->device()->tensorflow_cpu_worker_threads());

    auto shard = [&grad_in, &argmax, &grad_out, include_batch_in_index](
                     int64_t start, int64_t limit) {
      const int64_t batch_size =
          GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
      const int64_t output_size_per_batch =
          grad_out->NumElements() / batch_size;
      const int64_t input_size_per_batch = grad_in.NumElements() / batch_size;

      {
        auto grad_out_flat = grad_out->flat<T>();
        auto argmax_flat = argmax.flat<int64>();
        auto grad_in_flat = grad_in.flat<T>();

        const int64_t output_start = start * output_size_per_batch;
        const int64_t output_end = limit * output_size_per_batch;
        EigenMatrixMap inputShard(grad_out_flat.data() + output_start, 1,
                                  output_end - output_start);
        inputShard.setConstant(T(0));

        const int input_start = start * input_size_per_batch;
        const int input_end = limit * input_size_per_batch;
        for (int64_t index = input_start; index < input_end; index++) {
          if (index >= argmax.NumElements()) {
            break;
          }
          int64_t grad_out_index = argmax_flat(index);
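          // If the argmax was stored without the batch component, it is an
          // offset within one batch element; shift it by the batch base
          // offset before indexing the flat output gradient buffer.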
1030           if (!include_batch_in_index) {
1031             const int64_t cur_batch = index / input_size_per_batch;
1032             grad_out_index += cur_batch * output_size_per_batch;
1033           }
1034           CHECK(grad_out_index >= output_start && grad_out_index < output_end)
1035               << "Invalid output gradient index: " << grad_out_index << ", "
1036               << output_start << ", " << output_end;
1037           grad_out_flat(grad_out_index) += grad_in_flat(index);
1038         }
1039       }
1040     };
1041 
1042     const int64_t batch_size =
1043         GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
1044     const int64_t shard_cost = grad_out->NumElements() / batch_size;
1045     Shard(worker_threads.num_threads, worker_threads.workers, batch_size,
1046           shard_cost, shard);
1047   }
1048 };
1049 
1050 // TODO(b/175733711): Support int32 argmax type in MaxPoolGradWithArgmax op.
1051 template <typename Device, typename T>
1052 class MaxPoolingGradWithArgmaxOp : public OpKernel {
1053  public:
MaxPoolingGradWithArgmaxOp(OpKernelConstruction * context)1054   explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context)
1055       : OpKernel(context) {
1056     string data_format_str;
1057     if (std::is_same<Device, GPUDevice>::value) {
1058       OP_REQUIRES(context, !tensorflow::OpDeterminismRequired(),
1059                   errors::Unimplemented("Determinism is not yet supported "
1060                                         "for MaxPoolGradWithArgmax."));
1061     }
1062     auto status = context->GetAttr("data_format", &data_format_str);
1063     if (status.ok()) {
1064       OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
1065                   errors::InvalidArgument("Invalid data format"));
1066     }
1067 
1068     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
1069     OP_REQUIRES(context, ksize_.size() == 4,
1070                 errors::InvalidArgument("Sliding window ksize field must "
1071                                         "specify 4 dimensions"));
1072     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
1073     OP_REQUIRES(context, stride_.size() == 4,
1074                 errors::InvalidArgument("Sliding window stride field must "
1075                                         "specify 4 dimensions"));
1076     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1077     OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
1078                 errors::Unimplemented(
1079                     "Pooling is not yet supported on the batch dimension."));
1080     OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
1081                                              &include_batch_in_index_));
1082   }
1083 
Compute(OpKernelContext * context)1084   void Compute(OpKernelContext* context) override {
1085     const Tensor& tensor_in = context->input(0);
1086     const Tensor& grad_in = context->input(1);
1087     const Tensor& argmax = context->input(2);
1088 
1089     PoolParameters params{context,
1090                           ksize_,
1091                           stride_,
1092                           padding_,
1093                           /*explicit_paddings=*/{},
1094                           FORMAT_NHWC,
1095                           tensor_in.shape()};
1096     if (!context->status().ok()) {
1097       return;
1098     }
1099 
1100     TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows,
1101                            params.tensor_in_cols, params.depth});
1102     Tensor* grad_out = nullptr;
1103     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
1104                                 {0}, 0, out_shape, &grad_out));
1105 
1106     if (out_shape.num_elements() == 0) return;  // nothing to be done
1107 
1108     LaunchMaxPoolingGradWithArgmax<Device, T>::launch(
1109         context, params, grad_in, argmax, grad_out, include_batch_in_index_);
1110   }
1111 
1112  private:
1113   std::vector<int32> ksize_;
1114   std::vector<int32> stride_;
1115   Padding padding_;
1116   TensorFormat data_format_;
1117   bool include_batch_in_index_;
1118 };
1119 
1120 template <typename Device, typename T>
1121 struct LaunchMaxPoolingGradGradWithArgmax;
1122 
template <typename Device, typename T>
class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
 public:
  explicit MaxPoolingGradGradWithArgmaxOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
                                             &include_batch_in_index_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& grad_in = context->input(1);
    const Tensor& argmax = context->input(2);

    PoolParameters params{context,
                          ksize_,
                          stride_,
                          padding_,
                          /*explicit_paddings=*/{},
                          FORMAT_NHWC,
                          tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});

    Tensor* grad_out = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {0}, 0, out_shape, &grad_out));

    LaunchMaxPoolingGradGradWithArgmax<Device, T>::launch(
        context, params, grad_in, argmax, grad_out, include_batch_in_index_);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  bool include_batch_in_index_;
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
 public:
  typedef GPUDevice Device;
  explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("explicit_paddings", &explicit_paddings_));
    const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
    const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

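    // NaN propagation through the max is opt-in; for example, launching the
    // process with TF_ENABLE_MAXPOOL_NANPROP=1 in its environment flips
    // propagate_nans_ to true (the default is false).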
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    PoolParameters params{
        context,      ksize_,           stride_, padding_, explicit_paddings_,
        data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape =
        ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
                        params.out_width, params.depth);

    // Assuming qint8 <--> NCHW_VECT_C (int8x4) here.
    constexpr bool is_int8x4 = std::is_same<T, qint8>::value;
    OP_REQUIRES(context, (is_int8x4 == (data_format_ == FORMAT_NCHW_VECT_C)),
                errors::InvalidArgument(
                    "qint8 should be used with data_format NCHW_VECT_C."));
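    // NCHW_VECT_C stores channels in groups of four int8 values: a logically
    // [N, C, H, W] tensor is laid out as [N, C/4, H, W] with an int8x4 vector
    // innermost. For example (illustrative sizes), a [1, 8, 5, 5] qint8
    // tensor becomes [1, 2, 5, 5] vectors of four channels each.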

#if CUDNN_VERSION >= 7300
    DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
                             stride_, padding_, explicit_paddings_,
                             data_format_, tensor_in, out_shape,
                             propagate_nans_);
#else
    // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
    if (!is_int8x4 && data_format_ == FORMAT_NCHW) {
      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
                               stride_, padding_, explicit_paddings_,
                               data_format_, tensor_in, out_shape,
                               propagate_nans_);
    } else {
#if !defined(TENSORFLOW_USE_ROCM)
      OP_REQUIRES(context, padding_ != EXPLICIT,
                  errors::Unimplemented("Explicit padding is not supported ",
                                        "when CUDNN is not enabled."));
#endif
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
      if (is_int8x4) {
        LaunchMaxPoolingNoMask_NCHW_VECT_C<Device>::launch(context, params,
                                                           tensor_in, output);
      } else if (data_format_ == FORMAT_NHWC) {
        LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                                  output, propagate_nans_);
      } else {
        LOG(FATAL) << "MaxPool currently only supports the following (layout, "
                      "type) combinations: (NHWC, non-qint8), "
                      "(NCHW, non-qint8) or (NCHW_VECT_C, qint8). The "
                      "requested combination ("
                   << ToString(data_format_) << ", "
                   << DataTypeString(DataTypeToEnum<T>::v())
                   << ") is not supported.";
      }
    }
#endif
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  std::vector<int64> explicit_paddings_;
  TensorFormat data_format_;
  bool propagate_nans_;
};

template <typename T>
class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
 public:
  typedef GPUDevice Device;
  explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }
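    // In the V2 variant the window size and strides may arrive as
    // host-memory input tensors (inputs 1 and 2) rather than as attrs, so
    // they are re-validated below on every call to Compute rather than once
    // in the constructor.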
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    const int32_t ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32_t stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    PoolParameters params{context,
                          ksize,
                          stride,
                          padding_,
                          /*explicit_paddings=*/{},
                          data_format_,
                          tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape =
        ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
                        params.out_width, params.depth);
    if (data_format_ == FORMAT_NCHW) {
      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize,
                               stride, padding_, explicit_paddings_,
                               data_format_, tensor_in, out_shape,
                               propagate_nans_);
    } else {
      CHECK(data_format_ == FORMAT_NHWC)
          << "MaxPool only supports NCHW or NHWC format";
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
      LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                                output, propagate_nans_);
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  std::vector<int64> explicit_paddings_;
  TensorFormat data_format_;
  bool propagate_nans_;
};

template <typename T>
struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output, bool propagate_nans) {
    bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
        input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
        params.tensor_in_cols, params.depth, params.out_height,
        params.out_width, params.window_rows, params.window_cols,
        params.row_stride, params.col_stride, params.pad_top, params.pad_left,
        output->flat<T>().data(), nullptr, context->eigen_gpu_device(),
        propagate_nans, false);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolForwardNoMask"));
    }
  }
};
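
// The launcher above calls MaxPoolForwardWithOptionalArgmax with a null mask
// pointer, so a single GPU kernel serves both the mask-free forward path and
// the with-argmax path below, which passes a real argmax buffer instead.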

template <typename T>
struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T, int64> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output, Tensor* argmax,
                     bool propagate_nans, bool include_batch_in_index) {
    bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
        input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
        params.tensor_in_cols, params.depth, params.out_height,
        params.out_width, params.window_rows, params.window_cols,
        params.row_stride, params.col_stride, params.pad_top, params.pad_left,
        output->flat<T>().data(),
        reinterpret_cast<int64*>(argmax->flat<int64>().data()),
        context->eigen_gpu_device(), propagate_nans, include_batch_in_index);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
    }
  }
};

template <typename T>
struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out, const bool include_batch_in_index) {
    const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                           params.tensor_in_cols * params.depth;
    const int output_size = params.tensor_in_batch * params.out_height *
                            params.out_width * params.depth;
    const int top_offset = params.out_height * params.out_width * params.depth;
    const int bottom_offset =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    bool status = functor::MaxPoolBackwardWithArgmax<T>()(
        output_size, input_size, grad_in.flat<T>().data(),
        reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
        include_batch_in_index);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
    }
  }
};
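
// top_offset and bottom_offset above are the per-image element counts of the
// pooled output and of the input, respectively; when include_batch_in_index
// is false they let the kernel recover which image of the batch an element
// belongs to. As a rough worked example (made-up sizes): with a 4x4x1 input
// pooled to 2x2x1, top_offset = 2 * 2 * 1 = 4 and
// bottom_offset = 4 * 4 * 1 = 16.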

template <typename T>
struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out, const bool include_batch_in_index) {
    const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                           params.tensor_in_cols * params.depth;
    const int output_size = params.tensor_in_batch * params.out_height *
                            params.out_width * params.depth;
    const int top_offset =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    const int bottom_offset =
        params.out_width * params.out_height * params.depth;
    bool status = functor::MaxPoolGradBackwardWithArgmax<T>()(
        output_size, input_size, grad_in.flat<T>().data(),
        reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
        include_batch_in_index);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
    }
  }
};
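
// Note that top_offset and bottom_offset are swapped relative to the
// first-order launcher above: here grad_in is shaped like the original input
// and grad_out like the pooled output, so the per-image strides trade places.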

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_MAX_POOL_KERNELS(D, T)                                  \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("MaxPoolGrad").Device(DEVICE_##D).TypeConstraint<T>("T"),     \
      MaxPoolingGradOp<D##Device, T>);                                   \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("MaxPoolGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
      MaxPoolingGradGradOp<D##Device, T>);                               \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradV2")                          \
                              .Device(DEVICE_##D)                        \
                              .HostMemory("ksize")                       \
                              .HostMemory("strides")                     \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingGradOp<D##Device, T>);               \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradV2")                      \
                              .Device(DEVICE_##D)                        \
                              .HostMemory("ksize")                       \
                              .HostMemory("strides")                     \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingGradGradOp<D##Device, T>);           \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                      \
                              .Device(DEVICE_##D)                        \
                              .TypeConstraint<int64>("Targmax")          \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingWithArgmaxOp<D##Device, T, int64>);  \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")                  \
                              .Device(DEVICE_##D)                        \
                              .TypeConstraint<T>("T")                    \
                              .TypeConstraint<int64>("Targmax"),         \
                          MaxPoolingGradWithArgmaxOp<D##Device, T>);
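
// REGISTER_MAX_POOL_KERNELS stamps out one registration per (device, dtype)
// pair; the D##Device token pasting turns, e.g., D = CPU into the CPUDevice
// typedef declared near the top of this file.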

// The kernels below are implemented only for the CPU device.
#define REGISTER_CPU_ONLY_POOL_KERNELS(T)                          \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
      MaxPoolingOp<CPUDevice, T>);                                 \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("MaxPoolV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      MaxPoolingV2Op<CPUDevice, T>);                               \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                \
                              .Device(DEVICE_CPU)                  \
                              .TypeConstraint<int32>("Targmax")    \
                              .TypeConstraint<T>("T"),             \
                          MaxPoolingWithArgmaxOp<CPUDevice, T, int32>);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
#undef REGISTER_CPU_ONLY_POOL_KERNELS
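
// The CPU-only MaxPoolWithArgmax registration above constrains Targmax to
// int32; the generic REGISTER_MAX_POOL_KERNELS macro, instantiated for the
// CPU just below, adds the int64 variant, so both index widths are available
// on the CPU.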

#define REGISTER_CPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(CPU, T);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
#undef REGISTER_CPU_MAX_POOL_KERNELS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Forward declarations for the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                            \
  template <>                                                          \
  void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()(             \
      const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
      typename TTypes<T, 4>::ConstTensor input, int window_rows,       \
      int window_cols, int row_stride, int col_stride,                 \
      const Eigen::PaddingType& padding);                              \
  extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC
}  // namespace functor
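
// The matching definitions are instantiated in the GPU translation unit
// (presumably the companion .cu.cc for this kernel); the extern template
// declaration above suppresses implicit instantiation in this file.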

#define REGISTER_GPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(GPU, T)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
#undef REGISTER_GPU_MAX_POOL_KERNELS

// The kernels below are currently implemented only for the GPU device.
// Note(jiayq): Currently, the custom Caffe implementation is faster than the
// default Eigen implementation, so we use the custom kernel as the default.
// However, you can explicitly invoke the Eigen version using
// kernel_label_map.
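// (A registered kernel's label is matched against the "_kernel" attr of the
// NodeDef, so selecting the Eigen variant amounts to setting that attr to
// "eigen_tensor" on the MaxPool node; how a client sets it is outside the
// scope of this file.)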
#define REGISTER_GPU_ONLY_POOL_KERNELS(T)                        \
  REGISTER_KERNEL_BUILDER(Name("MaxPool")                        \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<T>("T")            \
                              .Label("eigen_tensor"),            \
                          MaxPoolingOp<GPUDevice, T>);           \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                      \
                              .Device(DEVICE_GPU)                \
                              .HostMemory("ksize")               \
                              .HostMemory("strides")             \
                              .TypeConstraint<T>("T")            \
                              .Label("eigen_tensor"),            \
                          MaxPoolingV2Op<GPUDevice, T>);         \
  REGISTER_KERNEL_BUILDER(                                       \
      Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
      MaxPoolingNoMaskOp<GPUDevice, T>);                         \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                      \
                              .Device(DEVICE_GPU)                \
                              .HostMemory("ksize")               \
                              .HostMemory("strides")             \
                              .TypeConstraint<T>("T"),           \
                          MaxPoolingNoMaskV2Op<GPUDevice, T>);   \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")      \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<T>("T")            \
                              .TypeConstraint<int64>("Targmax"), \
                          MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS);

// TODO(b/65847473): Re-enable once the underlying build error is fixed.
#if !defined(PLATFORM_WINDOWS)
REGISTER_KERNEL_BUILDER(
    Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
    MaxPoolingNoMaskOp<GPUDevice, qint8>);

REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("ksize")
                            .HostMemory("strides")
                            .TypeConstraint<qint8>("T"),
                        MaxPoolingV2Op<GPUDevice, qint8>);

REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("ksize")
                            .HostMemory("strides")
                            .TypeConstraint<qint8>("T")
                            .Label("eigen_tensor"),
                        MaxPoolingV2Op<GPUDevice, qint8>);
#endif  // !defined(PLATFORM_WINDOWS)

#undef REGISTER_GPU_ONLY_POOL_KERNELS

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef REGISTER_MAX_POOL_KERNELS

}  // namespace tensorflow