/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/maxpooling_op.h"

#include <vector>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/eigen_pooling.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/pooling_ops_common.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"

#if GOOGLE_CUDA
#include "cuda/include/cudnn.h"
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

const int kInvalidMaxPoolingIndex = -1;

template <typename Device, typename T>
static void SpatialMaxPoolWithArgMaxHelper(
    OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
    Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop,
    const PoolParameters& params, const bool include_batch_in_index) {
  if (input_backprop != nullptr) {
    OP_REQUIRES(
        context, include_batch_in_index,
        errors::Internal(
            "SpatialMaxPoolWithArgMaxHelper requires include_batch_in_index "
            "to be True when input_backprop != nullptr"));
  }

  typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      ConstEigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      EigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<int64, Eigen::Dynamic, Eigen::Dynamic>>
      EigenIndexMatrixMap;

  ConstEigenMatrixMap in_mat(
      tensor_in.flat<T>().data(), params.depth,
      params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
  EigenMatrixMap out_mat(
      output->flat<T>().data(), params.depth,
      params.out_width * params.out_height * params.tensor_in_batch);
  EigenIndexMatrixMap out_arg_max_mat(
      output_arg_max->flat<int64>().data(), params.depth,
      params.out_width * params.out_height * params.tensor_in_batch);

  const DeviceBase::CpuWorkerThreads& worker_threads =
      *(context->device()->tensorflow_cpu_worker_threads());

  // The following code basically does the following:
  // 1. Flattens the input and output tensors into two dimensional arrays.
  //    tensor_in_as_matrix:
  //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
  //    output_as_matrix:
  //      depth by (out_width * out_height * tensor_in_batch)
  //
  // 2. Walks through the set of columns in the flattened tensor_in_as_matrix,
  //    and updates the corresponding column(s) in output_as_matrix with the
  //    max value.
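  //
  // A worked example (illustrative values, not from the original code): with
  // in_rows = in_cols = 4, a 2x2 window, stride 2 and no padding, the input
  // element at (b = 0, h = 1, w = 1) has flattened column index
  //   in_index = (b * in_rows + h) * in_cols + w = (0 * 4 + 1) * 4 + 1 = 5,
  // and the inverse-window computation below maps it to the single output
  // cell (ph = 0, pw = 0), i.e. out_index = (0 * 2 + 0) * 2 + 0 = 0.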
  auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
                &output_arg_max, &out_backprop,
                include_batch_in_index](int64 start, int64 limit) {
    const int32 depth = params.depth;
    const int32 in_rows = params.tensor_in_rows;
    const int32 in_cols = params.tensor_in_cols;
    const int32 pad_rows = params.pad_rows;
    const int32 pad_cols = params.pad_cols;
    const int32 window_rows = params.window_rows;
    const int32 window_cols = params.window_cols;
    const int32 row_stride = params.row_stride;
    const int32 col_stride = params.col_stride;
    const int32 out_height = params.out_height;
    const int32 out_width = params.out_width;

    {
      // Initializes the output tensor with MIN<T>.
      const int32 output_image_size = out_height * out_width * depth;
      EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, 1,
                               (limit - start) * output_image_size);
      out_shard.setConstant(Eigen::NumTraits<T>::lowest());
      EigenIndexMatrixMap out_arg_max_shard(
          out_arg_max_mat.data() + start * output_image_size, 1,
          (limit - start) * output_image_size);
      out_arg_max_shard.setConstant(kInvalidMaxPoolingIndex);
    }

    for (int64 b = start; b < limit; ++b) {
      for (int h = 0; h < in_rows; ++h) {
        for (int w = 0; w < in_cols; ++w) {
          // (h_start, h_end) * (w_start, w_end) is the range that the input
          // vector projects to.
          const int hpad = h + pad_rows;
          const int wpad = w + pad_cols;
          const int h_start =
              (hpad < window_rows) ? 0 : (hpad - window_rows) / row_stride + 1;
          const int h_end = std::min(hpad / row_stride + 1, out_height);
          const int w_start =
              (wpad < window_cols) ? 0 : (wpad - window_cols) / col_stride + 1;
          const int w_end = std::min(wpad / col_stride + 1, out_width);
          // compute elementwise max
          const int64 in_index = (b * in_rows + h) * in_cols + w;
          for (int ph = h_start; ph < h_end; ++ph) {
            const int64 out_index_base = (b * out_height + ph) * out_width;
            for (int pw = w_start; pw < w_end; ++pw) {
              const int64 out_index = out_index_base + pw;
              /// NOTES(zhengxq): not using the eigen matrix operation for
              /// now.
              for (int d = 0; d < depth; ++d) {
                const T& input_ref = in_mat.coeffRef(d, in_index);
                T& output_ref = out_mat.coeffRef(d, out_index);
                int64& out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index);
                if (output_ref < input_ref ||
                    out_arg_max_ref == kInvalidMaxPoolingIndex) {
                  output_ref = input_ref;
                  if (include_batch_in_index) {
                    out_arg_max_ref = in_index * depth + d;
                  } else {
                    out_arg_max_ref = (h * in_cols + w) * depth + d;
                  }
                }
              }
            }
          }
        }
      }
    }

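    // Example of the argmax encoding above (illustrative numbers): with
    // include_batch_in_index = true, depth = 3, and the max for channel d = 2
    // found at in_index = 5, the stored value is 5 * 3 + 2 = 17, i.e. the
    // position in the flattened NHWC input. The backprop pass below uses
    // these indices to scatter-add out_backprop into input_backprop.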
    if (input_backprop != nullptr) {
      auto input_backprop_flat = input_backprop->flat<T>();
      auto out_arg_max_flat = output_arg_max->flat<int64>();
      auto out_backprop_flat = out_backprop.flat<T>();

      // Initialize output to 0.
      const int64 in_size = in_rows * in_cols * depth;
      const int64 in_start = start * in_size;
      const int64 in_end = limit * in_size;
      EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1,
                              in_end - in_start);
      in_shard.setConstant(T(0));

      // Backpropagate.
      const int out_size = out_height * out_width * depth;
      const int out_start = start * out_size;
      const int out_end = limit * out_size;
      for (int index = out_start; index < out_end; ++index) {
        int input_backprop_index = out_arg_max_flat(index);
        // Although this check is in the inner loop, it is worth its value
        // so we don't end up with memory corruptions. Our benchmark shows
        // that the performance impact is quite small.
        CHECK(FastBoundsCheck(input_backprop_index - in_start,
                              in_end - in_start))
            << "Invalid input backprop index: " << input_backprop_index;
        input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
      }
    }
  };

  const int64 shard_cost = params.tensor_in_rows * params.tensor_in_cols *
                           params.depth * params.window_rows *
                           params.window_cols;
  Shard(worker_threads.num_threads, worker_threads.workers,
        params.tensor_in_batch, shard_cost, shard);
}
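
// This helper is shared by the CPU kernels below: MaxPoolingGradOp calls it
// with a non-null input_backprop so that the forward max/argmax pass and the
// gradient scatter run in one sweep, while LaunchMaxPoolingWithArgmax calls
// it with nullptr to compute only the forward output and argmax.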

// The operation to compute MaxPool gradients.
// It takes three inputs:
//   - The original input tensor
//   - The original output tensor
//   - Backprop tensor for output
// It produces one output: backprop tensor for input.
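//
// For example (illustrative shapes, not from the original code): an NHWC
// input of shape [2, 4, 4, 3] max-pooled with a 2x2 window, stride 2 and
// VALID padding gives tensor_out and out_backprop of shape [2, 2, 2, 3];
// the op returns an input backprop of shape [2, 4, 4, 3].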
template <class Device, class T>
class MaxPoolingGradOp : public OpKernel {
 public:
  explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ",
                                "on device type ",
                                DeviceTypeString(context->device_type())));

    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
      OP_REQUIRES(
          context, ksize_[3] == 1 && stride_[3] == 1,
          errors::Unimplemented(
              "MaxPoolingGrad is not yet supported on the depth dimension."));
    }

    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));

    const TensorShape& output_shape = tensor_in.shape();

    Tensor tensor_out_dup;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_temp(
                                {1}, DataTypeToEnum<T>::v(), tensor_out.shape(),
                                &tensor_out_dup));
    Tensor tensor_out_arg_max;
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
                                                   tensor_out.shape(),
                                                   &tensor_out_arg_max));
    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(
        context, ksize[3] == 1 && stride[3] == 1,
        errors::Unimplemented(
            "MaxPoolingGrad is not yet supported on the depth dimension."));

    PoolParameters params{context,  ksize,       stride,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {0}, 0, output_shape, &output));

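    // Note the argument roles in the helper call below: tensor_out_dup
    // receives the recomputed forward output, tensor_out_arg_max the argmax
    // indices, and "output" (this op's result) is filled with the input
    // backprop.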
    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
        context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
        out_backprop, params, true);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

#ifdef GOOGLE_CUDA

template <typename T>
static void MaxPoolingBackwardCustomKernel(
    OpKernelContext* context, const std::vector<int32>& size,
    const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
    const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
  Tensor* output = nullptr;
  OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                              {0}, 0, tensor_in_shape, &output));

  PoolParameters params{context, size,        stride,
                        padding, FORMAT_NHWC, tensor_in_shape};
  if (!context->status().ok()) {
    return;
  }

  functor::MaxPoolBackwardNoMask<T>()(
      tensor_in->flat<T>().data(), params.tensor_in_batch,
      params.tensor_in_rows, params.tensor_in_cols, params.depth,
      params.out_height, params.out_width, params.window_rows,
      params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
      params.pad_cols, out_backprop.flat<T>().data(), output->flat<T>().data(),
      context->eigen_device<Eigen::GpuDevice>());
}

template <class T>
class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
 public:
  typedef Eigen::GpuDevice Device;

  explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));

    use_dnn_ = CanUseCudnn();
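
    // If TF_ENABLE_MAXPOOL_NANPROP is set to true, NaNs in the input are
    // propagated to the pooled output (cuDNN's propagate-NaN mode) rather
    // than being skipped by the max comparison.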
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));

    TensorShape output_shape = tensor_in.shape();

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    if (use_dnn_) {
      DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
                                   ksize, stride, padding_, data_format_,
                                   &tensor_in, &tensor_out, out_backprop,
                                   output_shape, propagate_nans_);
    } else {
      CHECK(data_format_ == FORMAT_NHWC)
          << "Non-Cudnn MaxPoolGrad only supports NHWC format";
      MaxPoolingBackwardCustomKernel<T>(context, ksize, stride, padding_,
                                        &tensor_in, out_backprop, output_shape);
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool use_dnn_;
  bool propagate_nans_;
};

#endif  // GOOGLE_CUDA

// The operation to compute gradient of MaxPool gradients.
// It takes three inputs:
//   - The original input tensor
//   - The original output tensor
//   - Backprop tensor for output gradients
// It produces one output: backprop tensor for output gradient.
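//
// For example (illustrative shapes): with an NHWC input of shape [2, 4, 4, 3]
// pooled by a 2x2 window with stride 2 and VALID padding, tensor_out has
// shape [2, 2, 2, 3], out_grad_backprop is input-shaped ([2, 4, 4, 3]), and
// the produced output is output-shaped ([2, 2, 2, 3]).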
template <class Device, class T>
class MaxPoolingGradGradOp : public OpKernel {
 public:
  explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingGradGradOp only supports NHWC ",
            "on device type ", DeviceTypeString(context->device_type())));

    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));

    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
      OP_REQUIRES(context, ksize_[3] == 1 && stride_[3] == 1,
                  errors::Unimplemented("MaxPoolingGradGrad is not yet "
                                        "supported on the depth dimension."));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_grad_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_grad_backprop should have 4 dimensions.
    OP_REQUIRES(
        context, out_grad_backprop.dims() == 4,
        errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(
        context, ksize[3] == 1 && stride[3] == 1,
        errors::Unimplemented(
            "MaxPoolingGradGrad is not yet supported on the depth dimension."));

    PoolParameters params{context,  ksize,       stride,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {2}, 0, tensor_out.shape(), &output));

    SpatialMaxPoolGradGrad(context, output, tensor_in, tensor_out,
                           out_grad_backprop, params, padding_);
  }


 private:
  void SpatialMaxPoolGradGrad(OpKernelContext* context, Tensor* bottom_diff,
                              const Tensor& tensor_in, const Tensor& tensor_out,
                              const Tensor& top_diff,
                              const PoolParameters& params,
                              const Padding& padding) {
    typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        ConstEigenMatrixMap;
    typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        EigenMatrixMap;

    ConstEigenMatrixMap in_mat(
        tensor_in.flat<T>().data(), params.depth,
        params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
    ConstEigenMatrixMap out_mat(
        tensor_out.flat<T>().data(), params.depth,
        params.out_width * params.out_height * params.tensor_in_batch);
    ConstEigenMatrixMap top_diff_mat(
        top_diff.flat<T>().data(), params.depth,
        params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
    EigenMatrixMap bottom_diff_mat(
        bottom_diff->flat<T>().data(), params.depth,
        params.out_width * params.out_height * params.tensor_in_batch);

    const DeviceBase::CpuWorkerThreads& worker_threads =
        *(context->device()->tensorflow_cpu_worker_threads());

    // The following code basically does the following:
    // 1. Flattens the input, output, top_diff and bottom_diff tensors into
    //    two dimensional arrays.
    //    tensor_in_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    tensor_out_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //    top_diff_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    bottom_diff_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //
    // 2. Walks through the set of columns in the flattened
    //    tensor_in_as_matrix, tensor_out_as_matrix, top_diff_as_matrix
    //    and updates the column(s) corresponding to the maximum values in
    //    tensor_out_as_matrix with the corresponding values in
    //    top_diff_as_matrix.
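    //
    // In other words, for each pooled output cell it rescans the input
    // window, finds the first input element equal to the pooled maximum, and
    // copies top_diff at that input position into bottom_diff at the output
    // position. E.g. (illustrative numbers) with a 2x2 window whose maximum
    // sits at input (h = 1, w = 0), bottom_diff for that output cell takes
    // the top_diff value at (h = 1, w = 0).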
    auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
                     int64 start, int64 limit) {
      const int32 depth = params.depth;
      const int32 in_rows = params.tensor_in_rows;
      const int32 in_cols = params.tensor_in_cols;
      const int32 pad_rows = params.pad_rows;
      const int32 pad_cols = params.pad_cols;
      const int32 window_rows = params.window_rows;
      const int32 window_cols = params.window_cols;
      const int32 row_stride = params.row_stride;
      const int32 col_stride = params.col_stride;
      const int32 out_height = params.out_height;
      const int32 out_width = params.out_width;

      {
        // Initializes the output grad backprop tensor with 0.
        const int32 output_image_size = out_height * out_width * params.depth;
        EigenMatrixMap bottom_diff_shard(
            bottom_diff_mat.data() + start * output_image_size, 1,
            (limit - start) * output_image_size);
        bottom_diff_shard.setZero();
      }

      for (int b = start; b < limit; ++b) {
        for (int ph = 0; ph < out_height; ++ph) {
          for (int pw = 0; pw < out_width; ++pw) {
            // (h_start, h_end) * (w_start, w_end) is the range that the input
            // vector projects to.
            int h_start = ph * row_stride - pad_rows;
            const int h_end = std::min(h_start + window_rows, in_rows);
            int w_start = pw * col_stride - pad_cols;
            const int w_end = std::min(w_start + window_cols, in_cols);
            h_start = std::max(h_start, 0);
            w_start = std::max(w_start, 0);
            const int out_index = (b * out_height + ph) * out_width + pw;
            // Find value corresponding to the input maximum in top_diff.
            for (int d = 0; d < depth; ++d) {
              const T& output_ref = out_mat.coeffRef(d, out_index);
              bool should_stop = false;
              for (int h = h_start; h < h_end && !should_stop; ++h) {
                for (int w = w_start; w < w_end && !should_stop; ++w) {
                  const int in_index = (b * in_rows + h) * in_cols + w;
                  const T& input_ref = in_mat.coeffRef(d, in_index);
                  if (output_ref == input_ref) {
                    T& bottom_diff_ref = bottom_diff_mat.coeffRef(d, out_index);
                    bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
                    should_stop = true;
                  }
                }
              }
            }
          }
        }
      }
    };

    const int64 shard_cost = params.out_width * params.out_height *
                             params.depth * params.window_rows *
                             params.window_cols;
    Shard(worker_threads.num_threads, worker_threads.workers,
          params.tensor_in_batch, shard_cost, shard);
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

#ifdef GOOGLE_CUDA

template <class T>
class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
 public:
  typedef Eigen::GpuDevice Device;

  explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_grad_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_grad_backprop should have 4 dimensions.
    OP_REQUIRES(
        context, out_grad_backprop.dims() == 4,
        errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, tensor_out.shape(), &output));

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    PoolParameters params{context,  ksize,        stride,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    functor::MaxPoolGradBackwardNoMask<T>()(
        data_format_, tensor_in.flat<T>().data(), tensor_out.flat<T>().data(),
        params.tensor_in_batch, params.out_height, params.out_width,
        params.depth, params.tensor_in_rows, params.tensor_in_cols,
        params.window_rows, params.window_cols, params.row_stride,
        params.col_stride, params.pad_rows, params.pad_cols,
        out_grad_backprop.flat<T>().data(), output->flat<T>().data(),
        context->eigen_device<Eigen::GpuDevice>());
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool use_dnn_;
};

#endif  // GOOGLE_CUDA

template <typename Device, typename T>
struct LaunchMaxPoolingNoMask;

template <typename Device, typename T>
class MaxPoolingNoMaskOp : public OpKernel {
 public:
  explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingNoMaskOp only supports NHWC on device type ",
            DeviceTypeString(context->device_type())));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    PoolParameters params{context,  ksize_,       stride_,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                              output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename Device, typename T>
class MaxPoolingNoMaskV2Op : public OpKernel {
 public:
  explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingNoMaskV2Op only supports NHWC on device type ",
            DeviceTypeString(context->device_type())));
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    PoolParameters params{context,  ksize,        stride,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                              output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename Device, typename T>
struct LaunchMaxPoolingWithArgmax;

template <typename T>
struct LaunchMaxPoolingWithArgmax<CPUDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output, Tensor* argmax,
                     bool propagate_nans, bool include_batch_in_index) {
    Tensor unused;
    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(context, output, argmax,
                                                 nullptr, input, unused, params,
                                                 include_batch_in_index);
  }
};
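
// Note on the argmax encoding produced above: with include_batch_in_index =
// true the stored index is ((b * height + y) * width + x) * channels + c into
// the flattened input, and with false it is (y * width + x) * channels + c,
// i.e. relative to the start of that batch entry's image.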

template <typename Device, typename T>
class MaxPoolingWithArgmaxOp : public OpKernel {
 public:
  explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
                                             &include_batch_in_index_));
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    PoolParameters params{context,  ksize_,      stride_,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
    Tensor* argmax = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));

    LaunchMaxPoolingWithArgmax<Device, T>::launch(
        context, params, tensor_in, output, argmax, propagate_nans_,
        include_batch_in_index_);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  bool propagate_nans_;
  bool include_batch_in_index_;
};

template <typename Device, typename T>
struct LaunchMaxPoolingGradWithArgmax;

template <typename T>
struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> {
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      EigenMatrixMap;

  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out, const bool include_batch_in_index) {
    const DeviceBase::CpuWorkerThreads& worker_threads =
        *(context->device()->tensorflow_cpu_worker_threads());

    auto shard = [&grad_in, &argmax, &grad_out, include_batch_in_index](
                     int64 start, int64 limit) {
      const int64 batch_size =
          GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
      const int64 output_size_per_batch = grad_out->NumElements() / batch_size;
      const int64 input_size_per_batch = grad_in.NumElements() / batch_size;

      {
        auto grad_out_flat = grad_out->flat<T>();
        auto argmax_flat = argmax.flat<int64>();
        auto grad_in_flat = grad_in.flat<T>();

        const int64 output_start = start * output_size_per_batch;
        const int64 output_end = limit * output_size_per_batch;
        EigenMatrixMap inputShard(grad_out_flat.data() + output_start, 1,
                                  output_end - output_start);
        inputShard.setConstant(T(0));

        const int64 input_start = start * input_size_per_batch;
        const int64 input_end = limit * input_size_per_batch;
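        // When include_batch_in_index is false, the stored argmax values are
        // relative to the start of each image, so the batch offset is added
        // back below before scattering. E.g. (illustrative numbers) for batch
        // entry 1 with output_size_per_batch = 12, an image-relative index 5
        // becomes global index 1 * 12 + 5 = 17.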
        for (int64 index = input_start; index < input_end; index++) {
          int64 grad_out_index = argmax_flat(index);
          if (!include_batch_in_index) {
            const int64 cur_batch = index / input_size_per_batch;
            grad_out_index += cur_batch * output_size_per_batch;
          }
          CHECK(grad_out_index >= output_start && grad_out_index < output_end)
              << "Invalid output gradient index: " << grad_out_index << ", "
              << output_start << ", " << output_end;
          grad_out_flat(grad_out_index) += grad_in_flat(index);
        }
      }
    };

    const int64 batch_size = GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
    const int64 shard_cost = grad_out->NumElements() / batch_size;
    Shard(worker_threads.num_threads, worker_threads.workers, batch_size,
          shard_cost, shard);
  }
};

template <typename Device, typename T>
class MaxPoolingGradWithArgmaxOp : public OpKernel {
 public:
  explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format_str;
    auto status = context->GetAttr("data_format", &data_format_str);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
    }

    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
                                             &include_batch_in_index_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& grad_in = context->input(1);
    const Tensor& argmax = context->input(2);

    PoolParameters params{context,  ksize_,      stride_,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows,
                           params.tensor_in_cols, params.depth});
    Tensor* grad_out = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {0}, 0, out_shape, &grad_out));

    LaunchMaxPoolingGradWithArgmax<Device, T>::launch(
        context, params, grad_in, argmax, grad_out, include_batch_in_index_);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool include_batch_in_index_;
};

template <typename Device, typename T>
struct LaunchMaxPoolingGradGradWithArgmax;

template <typename Device, typename T>
class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
 public:
  explicit MaxPoolingGradGradWithArgmaxOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
                                             &include_batch_in_index_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& grad_in = context->input(1);
    const Tensor& argmax = context->input(2);

    PoolParameters params{context,  ksize_,      stride_,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});

    Tensor* grad_out = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {0}, 0, out_shape, &grad_out));

    LaunchMaxPoolingGradGradWithArgmax<Device, T>::launch(
        context, params, grad_in, argmax, grad_out, include_batch_in_index_);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  bool include_batch_in_index_;
};

#if GOOGLE_CUDA
template <typename T>
class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
 public:
  typedef GPUDevice Device;
  explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    use_dnn_ = CanUseCudnn();

    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }


  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    PoolParameters params{context,  ksize_,       stride_,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape =
        ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
                        params.out_width, params.depth);

    // Assuming qint8 <--> NCHW_VECT_C (int8x4) here.
    constexpr bool is_int8x4 = std::is_same<T, qint8>::value;
    OP_REQUIRES(context, (is_int8x4 == (data_format_ == FORMAT_NCHW_VECT_C)),
                errors::InvalidArgument(
                    "qint8 should be used with data_format NCHW_VECT_C."));
1164 
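    // Note that the if/else structure below deliberately straddles the
    // preprocessor conditional: exactly one of the two `if` headers survives
    // preprocessing, and the trailing `else` pairs with whichever one was
    // kept. With cuDNN >= 7.3 the dnn path handles every supported layout;
    // on older versions it is restricted to non-qint8 NCHW.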
#if CUDNN_VERSION >= 7300
    if (use_dnn_) {
      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
                               stride_, padding_, data_format_, tensor_in,
                               out_shape, propagate_nans_);
#else
    // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
    if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
                               stride_, padding_, data_format_, tensor_in,
                               out_shape, propagate_nans_);
#endif
    } else {
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
      if (is_int8x4) {
        LaunchMaxPoolingNoMask_NCHW_VECT_C<Device>::launch(context, params,
                                                           tensor_in, output);
      } else if (data_format_ == FORMAT_NHWC) {
        LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                                  output, propagate_nans_);
      } else {
        LOG(FATAL) << "MaxPool currently only supports the following (layout, "
                      "type) combinations: (NHWC, non-qint8), "
                      "(NCHW, non-qint8) or (NCHW_VECT_C, qint8). The "
                      "requested combination ("
                   << ToString(data_format_) << ", "
                   << DataTypeString(DataTypeToEnum<T>::v())
                   << ") is not supported.";
      }
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool use_dnn_;
  bool propagate_nans_;
};

template <typename T>
class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
 public:
  typedef GPUDevice Device;
  explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    use_dnn_ = CanUseCudnn();
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }

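  // MaxPoolV2 may supply ksize and strides as runtime tensors (inputs 1 and
  // 2) rather than attrs, so the window parameters are re-read and
  // re-validated on every call to Compute.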
  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    PoolParameters params{context,  ksize,        stride,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape =
        ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
                        params.out_width, params.depth);
    if (use_dnn_ && data_format_ == FORMAT_NCHW) {
      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize,
                               stride, padding_, data_format_, tensor_in,
                               out_shape, propagate_nans_);
    } else {
      CHECK(data_format_ == FORMAT_NHWC)
          << "Non-Cudnn MaxPool only supports NHWC format";
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
      LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                                output, propagate_nans_);
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool use_dnn_;
  bool propagate_nans_;
};

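// The launch() wrappers below adapt the CUDA functors declared in
// maxpooling_op_gpu.h to OpKernelContext: they hand the kernels raw device
// pointers plus the pooling geometry, and convert a failed launch into an
// errors::Internal status on the context.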
template <typename T>
struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output, bool propagate_nans) {
    bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
        input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
        params.tensor_in_cols, params.depth, params.out_height,
        params.out_width, params.window_rows, params.window_cols,
        params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
        output->flat<T>().data(), nullptr, context->eigen_gpu_device(),
        propagate_nans, false);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolForwardNoMask"));
    }
  }
};

template <typename T>
struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output, Tensor* argmax,
                     bool propagate_nans, bool include_batch_in_index) {
    bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
        input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
        params.tensor_in_cols, params.depth, params.out_height,
        params.out_width, params.window_rows, params.window_cols,
        params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
        output->flat<T>().data(),
        reinterpret_cast<int64*>(argmax->flat<int64>().data()),
        context->eigen_gpu_device(), propagate_nans, include_batch_in_index);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
    }
  }
};

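// input_size/output_size below are total element counts used to size the
// CUDA grid; top_offset and bottom_offset are the per-batch strides of the
// pooled (gradient-in) and input (gradient-out) planes, which the kernel
// needs to recover batch-local positions when the argmax indices do not
// include the batch dimension.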
template <typename T>
struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out, const bool include_batch_in_index) {
    const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                           params.tensor_in_cols * params.depth;
    const int output_size = params.tensor_in_batch * params.out_height *
                            params.out_width * params.depth;
    const int top_offset = params.out_height * params.out_width * params.depth;
    const int bottom_offset =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    bool status = functor::MaxPoolBackwardWithArgmax<T>()(
        output_size, input_size, grad_in.flat<T>().data(),
        reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
        include_batch_in_index);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
    }
  }
};

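// For the second-order gradient the roles are reversed: grad_in has the
// input shape and grad_out has the pooled output shape, so top_offset and
// bottom_offset are swapped relative to LaunchMaxPoolingGradWithArgmax
// above.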
template <typename T>
struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out, const bool include_batch_in_index) {
    const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                           params.tensor_in_cols * params.depth;
    const int output_size = params.tensor_in_batch * params.out_height *
                            params.out_width * params.depth;
    const int top_offset =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    const int bottom_offset =
        params.out_width * params.out_height * params.depth;
    bool status = functor::MaxPoolGradBackwardWithArgmax<T>()(
        output_size, input_size, grad_in.flat<T>().data(),
        reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
        include_batch_in_index);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
    }
  }
};

#endif  // GOOGLE_CUDA

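// Kernel registration. REGISTER_MAX_POOL_KERNELS(D, T) registers the grad,
// grad-grad, and argmax variants for one (device, dtype) pair; the TF_CALL_*
// macros below then stamp it out once per supported dtype. For example,
// TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS) expands to
// REGISTER_CPU_MAX_POOL_KERNELS(float); REGISTER_CPU_MAX_POOL_KERNELS(double);
// and so on for each real numeric type.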
#define REGISTER_MAX_POOL_KERNELS(D, T)                                  \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("MaxPoolGrad").Device(DEVICE_##D).TypeConstraint<T>("T"),     \
      MaxPoolingGradOp<D##Device, T>);                                   \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("MaxPoolGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
      MaxPoolingGradGradOp<D##Device, T>);                               \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradV2")                          \
                              .Device(DEVICE_##D)                        \
                              .HostMemory("ksize")                       \
                              .HostMemory("strides")                     \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingGradOp<D##Device, T>);               \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradV2")                      \
                              .Device(DEVICE_##D)                        \
                              .HostMemory("ksize")                       \
                              .HostMemory("strides")                     \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingGradGradOp<D##Device, T>);           \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                      \
                              .Device(DEVICE_##D)                        \
                              .TypeConstraint<int64>("Targmax")          \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingWithArgmaxOp<D##Device, T>);         \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")                  \
                              .Device(DEVICE_##D)                        \
                              .TypeConstraint<T>("T")                    \
                              .TypeConstraint<int64>("Targmax"),         \
                          MaxPoolingGradWithArgmaxOp<D##Device, T>);

// The kernels below are implemented only for the CPU device.
#define REGISTER_CPU_ONLY_POOL_KERNELS(T)                          \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
      MaxPoolingOp<CPUDevice, T>);                                 \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("MaxPoolV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      MaxPoolingV2Op<CPUDevice, T>);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
#undef REGISTER_CPU_ONLY_POOL_KERNELS

#define REGISTER_CPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(CPU, T);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
#undef REGISTER_CPU_MAX_POOL_KERNELS

#if GOOGLE_CUDA

// Forward declarations for the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                            \
  template <>                                                          \
  void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()(             \
      const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
      typename TTypes<T, 4>::ConstTensor input, int window_rows,       \
      int window_cols, int row_stride, int col_stride,                 \
      const Eigen::PaddingType& padding);                              \
  extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC
}  // namespace functor

#define REGISTER_GPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(GPU, T)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
#undef REGISTER_GPU_MAX_POOL_KERNELS

// The kernels below are currently implemented only for the GPU device.
// Note(jiayq): currently the Caffe custom implementation is faster than the
// default Eigen implementation, so we are using the custom kernel as the
// default. However, you can explicitly invoke the Eigen version using
// kernel_label_map.
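// For instance, from Python the "eigen_tensor"-labeled kernel registered
// below can be selected through the graph's internal kernel-label map (an
// unstable, internal API; shown here only as a sketch):
//
//   g = tf.Graph()
//   with g.as_default(), g._kernel_label_map({"MaxPool": "eigen_tensor"}):
//     out = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
//                          padding="VALID")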
#define REGISTER_GPU_ONLY_POOL_KERNELS(T)                        \
  REGISTER_KERNEL_BUILDER(Name("MaxPool")                        \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<T>("T")            \
                              .Label("eigen_tensor"),            \
                          MaxPoolingOp<GPUDevice, T>);           \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                      \
                              .Device(DEVICE_GPU)                \
                              .HostMemory("ksize")               \
                              .HostMemory("strides")             \
                              .TypeConstraint<T>("T")            \
                              .Label("eigen_tensor"),            \
                          MaxPoolingV2Op<GPUDevice, T>);         \
  REGISTER_KERNEL_BUILDER(                                       \
      Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
      MaxPoolingNoMaskOp<GPUDevice, T>);                         \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                      \
                              .Device(DEVICE_GPU)                \
                              .HostMemory("ksize")               \
                              .HostMemory("strides")             \
                              .TypeConstraint<T>("T"),           \
                          MaxPoolingNoMaskV2Op<GPUDevice, T>);   \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")      \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<T>("T")            \
                              .TypeConstraint<int64>("Targmax"), \
                          MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS);

// TODO(b/65847473): Re-enable once the underlying build error is fixed.
#if !defined(PLATFORM_WINDOWS)
REGISTER_KERNEL_BUILDER(
    Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
    MaxPoolingNoMaskOp<GPUDevice, qint8>);

REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("ksize")
                            .HostMemory("strides")
                            .TypeConstraint<qint8>("T"),
                        MaxPoolingV2Op<GPUDevice, qint8>);

REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("ksize")
                            .HostMemory("strides")
                            .TypeConstraint<qint8>("T")
                            .Label("eigen_tensor"),
                        MaxPoolingV2Op<GPUDevice, qint8>);
#endif  // !defined(PLATFORM_WINDOWS)

#undef REGISTER_GPU_ONLY_POOL_KERNELS

#endif  // GOOGLE_CUDA

#undef REGISTER_MAX_POOL_KERNELS

}  // namespace tensorflow