1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 // See docs in ../ops/nn_ops.cc.
17
18 #define EIGEN_USE_THREADS
19
20 #include "tensorflow/core/kernels/maxpooling_op.h"
21
22 #include <type_traits>
23 #include <vector>
24
25 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
26 #include "tensorflow/core/common_runtime/device.h"
27 #include "tensorflow/core/framework/bounds_check.h"
28 #include "tensorflow/core/framework/numeric_op.h"
29 #include "tensorflow/core/framework/op_kernel.h"
30 #include "tensorflow/core/framework/register_types.h"
31 #include "tensorflow/core/framework/tensor.h"
32 #include "tensorflow/core/framework/tensor_shape.h"
33 #include "tensorflow/core/framework/tensor_slice.h"
34 #include "tensorflow/core/kernels/conv_2d.h"
35 #include "tensorflow/core/kernels/eigen_pooling.h"
36 #include "tensorflow/core/kernels/ops_util.h"
37 #include "tensorflow/core/kernels/pooling_ops_common.h"
38 #include "tensorflow/core/lib/core/errors.h"
39 #include "tensorflow/core/lib/gtl/array_slice.h"
40 #include "tensorflow/core/util/determinism.h"
41 #include "tensorflow/core/util/env_var.h"
42 #include "tensorflow/core/util/padding.h"
43 #include "tensorflow/core/util/tensor_format.h"
44 #include "tensorflow/core/util/use_cudnn.h"
45
46 #if GOOGLE_CUDA
47 #include "third_party/gpus/cudnn/cudnn.h"
48 #endif // GOOGLE_CUDA
49 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
50 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
51 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
52 #include "tensorflow/core/platform/stream_executor.h"
53 #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
54
55 namespace tensorflow {
56
57 typedef Eigen::ThreadPoolDevice CPUDevice;
58 typedef Eigen::GpuDevice GPUDevice;
59
60 const int kInvalidMaxPoolingIndex = -1;
61
62 template <typename Device, typename T, typename Targmax>
63 static void SpatialMaxPoolWithArgMaxHelper(
64 OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
65 Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop,
66 const PoolParameters& params, const bool include_batch_in_index) {
67 if (input_backprop != nullptr) {
68 OP_REQUIRES(
69 context, include_batch_in_index,
70 errors::Internal(
71 "SpatialMaxPoolWithArgMaxHelper requires include_batch_in_index "
72 "to be True when input_backprop != nullptr"));
73 OP_REQUIRES(
74 context, (std::is_same<Targmax, int64>::value),
75 errors::Internal("SpatialMaxPoolWithArgMaxHelper requires Targmax "
76 "to be int64 when input_backprop != nullptr"));
77 }
78 if (tensor_in.NumElements() == 0 || output->NumElements() == 0) return;
79
80 typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
81 ConstEigenMatrixMap;
82 typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
83 EigenMatrixMap;
84 typedef Eigen::Map<Eigen::Matrix<Targmax, Eigen::Dynamic, Eigen::Dynamic>>
85 EigenIndexMatrixMap;
86
87 ConstEigenMatrixMap in_mat(
88 tensor_in.flat<T>().data(), params.depth,
89 params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
90 EigenMatrixMap out_mat(
91 output->flat<T>().data(), params.depth,
92 params.out_width * params.out_height * params.tensor_in_batch);
93 EigenIndexMatrixMap out_arg_max_mat(
94 output_arg_max->flat<Targmax>().data(), params.depth,
95 params.out_width * params.out_height * params.tensor_in_batch);
96
97 const DeviceBase::CpuWorkerThreads& worker_threads =
98 *(context->device()->tensorflow_cpu_worker_threads());
99
100 // The code below does the following:
101 // 1. Flattens the input and output tensors into two dimensional arrays.
102 // tensor_in_as_matrix:
103 // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
104 // output_as_matrix:
105 // depth by (out_width * out_height * tensor_in_batch)
106 //
107 // 2. Walks through the set of columns in the flattened tensor_in_as_matrix,
108 // and updates the corresponding column(s) in output_as_matrix with the
109 // max value.
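//
// As a small illustration (not part of the op's contract): with
// tensor_in_batch = 1, tensor_in_rows = tensor_in_cols = 4 and depth = 3,
// in_mat is a 3 x 16 matrix whose column index is the flattened position
// (b * in_rows + h) * in_cols + w, and the argmax recorded for a winning
// element at (b, h, w, d) is
//   ((b * in_rows + h) * in_cols + w) * depth + d   when include_batch_in_index
//   (h * in_cols + w) * depth + d                   otherwise
// i.e. a flat index into the whole input or into that image's slice.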
110 auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
111 &output_arg_max, &out_backprop, context,
112 include_batch_in_index](int64_t start, int64_t limit) {
113 const int32_t depth = params.depth;
114 const int32_t in_rows = params.tensor_in_rows;
115 const int32_t in_cols = params.tensor_in_cols;
116 const int32_t pad_top = params.pad_top;
117 const int32_t pad_left = params.pad_left;
118 const int32_t window_rows = params.window_rows;
119 const int32_t window_cols = params.window_cols;
120 const int32_t row_stride = params.row_stride;
121 const int32_t col_stride = params.col_stride;
122 const int32_t out_height = params.out_height;
123 const int32_t out_width = params.out_width;
124
125 {
126 // Initializes the output tensor with MIN<T>.
127 const int32_t output_image_size = out_height * out_width * depth;
128 EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, 1,
129 (limit - start) * output_image_size);
130 out_shard.setConstant(Eigen::NumTraits<T>::lowest());
131 EigenIndexMatrixMap out_arg_max_shard(
132 out_arg_max_mat.data() + start * output_image_size, 1,
133 (limit - start) * output_image_size);
134 out_arg_max_shard.setConstant(kInvalidMaxPoolingIndex);
135 }
136
137 for (int64_t b = start; b < limit; ++b) {
138 for (int h = 0; h < in_rows; ++h) {
139 for (int w = 0; w < in_cols; ++w) {
140 // (h_start, h_end) * (w_start, w_end) is the range that the input
141 // vector projects to.
142 const int hpad = h + pad_top;
143 const int wpad = w + pad_left;
144 const int h_start =
145 (hpad < window_rows) ? 0 : (hpad - window_rows) / row_stride + 1;
146 const int h_end = std::min(hpad / row_stride + 1, out_height);
147 const int w_start =
148 (wpad < window_cols) ? 0 : (wpad - window_cols) / col_stride + 1;
149 const int w_end = std::min(wpad / col_stride + 1, out_width);
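// For example (illustrative values only): with window_rows = 3,
// row_stride = 2 and pad_top = 0, input row h = 5 gives hpad = 5, so
// h_start = (5 - 3) / 2 + 1 = 2 and h_end = min(5 / 2 + 1, out_height) = 3;
// i.e. row 5 contributes only to output row 2, whose window spans input
// rows 4..6.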
150 // compute elementwise max
151 const int64_t in_index = (b * in_rows + h) * in_cols + w;
152 for (int ph = h_start; ph < h_end; ++ph) {
153 const int64_t out_index_base = (b * out_height + ph) * out_width;
154 for (int pw = w_start; pw < w_end; ++pw) {
155 const int64_t out_index = out_index_base + pw;
156 /// NOTES(zhengxq): not using the eigen matrix operation for
157 /// now.
158 for (int d = 0; d < depth; ++d) {
159 const T& input_ref = in_mat.coeffRef(d, in_index);
160 T& output_ref = out_mat.coeffRef(d, out_index);
161 Targmax& out_arg_max_ref =
162 out_arg_max_mat.coeffRef(d, out_index);
163 if (output_ref < input_ref ||
164 out_arg_max_ref == kInvalidMaxPoolingIndex) {
165 output_ref = input_ref;
166 if (include_batch_in_index) {
167 out_arg_max_ref = in_index * depth + d;
168 } else {
169 out_arg_max_ref = (h * in_cols + w) * depth + d;
170 }
171 }
172 }
173 }
174 }
175 }
176 }
177 }
178
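// When input_backprop is non-null (the fused MaxPoolGrad path), the argmax
// values computed above are reused to scatter the incoming gradient: for
// every output index i, roughly
//   input_backprop[argmax[i]] += out_backprop[i];
// This is why include_batch_in_index must be true on this path -- argmax[i]
// has to address the batch-flattened input, not a single image.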
179 if (input_backprop != nullptr) {
180 auto input_backprop_flat = input_backprop->flat<T>();
181 auto out_arg_max_flat = output_arg_max->flat<int64>();
182 auto out_backprop_flat = out_backprop.flat<T>();
183
184 // Initialize output to 0.
185 const int64_t in_size = in_rows * in_cols * depth;
186 const int64_t in_start = start * in_size;
187 const int64_t in_end = limit * in_size;
188 EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1,
189 in_end - in_start);
190 in_shard.setConstant(T(0));
191
192 // Backpropagate.
193 const int out_size = out_height * out_width * depth;
194 const int out_start = start * out_size;
195 const int out_end = limit * out_size;
196 for (int index = out_start; index < out_end; ++index) {
197 int input_backprop_index = out_arg_max_flat(index);
198 // Although this check is in the inner loop, it is worth its value
199 // so we don't end up with memory corruption. Our benchmark shows that
200 // the performance impact is quite small.
201 // CHECK(input_backprop_index >= in_start && input_backprop_index <
202 // in_end)
203 OP_REQUIRES(context, FastBoundsCheck(input_backprop_index - in_start, in_end - in_start), errors::InvalidArgument("Invalid input backprop index: ", input_backprop_index));
204 if (index < out_backprop.NumElements()) {
205 input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
206 }
207 }
208 }
209 };
210
211 const int64_t shard_cost = params.tensor_in_rows * params.tensor_in_cols *
212 params.depth * params.window_rows *
213 params.window_cols;
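// Shard parallelizes over the batch dimension; shard_cost is a rough
// per-image work estimate (input pixels x window size x depth) that the
// thread pool uses to decide how many images each worker handles.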
214 Shard(worker_threads.num_threads, worker_threads.workers,
215 params.tensor_in_batch, shard_cost, shard);
216 }
217
218 // The operation to compute MaxPool gradients.
219 // It takes three inputs:
220 // - The original input tensor
221 // - The original output tensor
222 // - Backprop tensor for output
223 // It produces one output: backprop tensor for input.
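// As a 1-D sketch (illustrative only): for input [1, 3, 2, 4] with a width-2
// window and stride 2, the forward output is [3, 4]; given out_backprop
// [g0, g1], the input backprop is [0, g0, 0, g1] -- each incoming gradient is
// routed to the element that produced the max (the CPU helper keeps the first
// maximum encountered on ties).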
224 template <class Device, class T>
225 class MaxPoolingGradOp : public OpKernel {
226 public:
227 explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
228 string data_format;
229 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
230 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
231 errors::InvalidArgument("Invalid data format"));
232 OP_REQUIRES(
233 context, data_format_ == FORMAT_NHWC,
234 errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ",
235 "on device type ",
236 DeviceTypeString(context->device_type())));
237
238 if (context->num_inputs() == 3) {
239 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
240 OP_REQUIRES(context, ksize_.size() == 4,
241 errors::InvalidArgument("Sliding window ksize field must "
242 "specify 4 dimensions"));
243 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
244 OP_REQUIRES(context, stride_.size() == 4,
245 errors::InvalidArgument("Sliding window strides field must "
246 "specify 4 dimensions"));
247 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
248 errors::Unimplemented(
249 "Pooling is not yet supported on the batch dimension."));
250 OP_REQUIRES(
251 context, ksize_[3] == 1 && stride_[3] == 1,
252 errors::Unimplemented(
253 "MaxPoolingGrad is not yet supported on the depth dimension."));
254 }
255
256 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
257
258 if (padding_ == Padding::EXPLICIT) {
259 OP_REQUIRES_OK(
260 context, context->GetAttr("explicit_paddings", &explicit_paddings_));
261 OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
262 /*num_dims=*/4, data_format_));
263 }
264 }
265
266 void Compute(OpKernelContext* context) override {
267 const Tensor& tensor_in = context->input(0);
268 const Tensor& tensor_out = context->input(1);
269 const Tensor& out_backprop = context->input(2);
270
271 // For maxpooling, tensor_in should have 4 dimensions.
272 OP_REQUIRES(context, tensor_in.dims() == 4,
273 errors::InvalidArgument("tensor_in must be 4-dimensional"));
274 OP_REQUIRES(context, tensor_out.dims() == 4,
275 errors::InvalidArgument("tensor_out must be 4-dimensional"));
276 // For maxpooling, out_backprop should have 4 dimensions.
277 OP_REQUIRES(context, out_backprop.dims() == 4,
278 errors::InvalidArgument("out_backprop must be 4-dimensional"));
279
280 const TensorShape& output_shape = tensor_in.shape();
281
282 Tensor tensor_out_dup;
283 OP_REQUIRES_OK(context, context->forward_input_or_allocate_temp(
284 {1}, DataTypeToEnum<T>::v(), tensor_out.shape(),
285 &tensor_out_dup));
286 Tensor tensor_out_arg_max;
287 OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
288 tensor_out.shape(),
289 &tensor_out_arg_max));
290 std::vector<int32> ksize = ksize_;
291 std::vector<int32> stride = stride_;
292 if (context->num_inputs() == 5) {
293 const Tensor& tensor_ksize = context->input(3);
294 auto value_ksize = tensor_ksize.flat<int32>();
295 ksize.resize(tensor_ksize.shape().num_elements());
296 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
297
298 const Tensor& tensor_stride = context->input(4);
299 auto value_stride = tensor_stride.flat<int32>();
300 stride.resize(tensor_stride.shape().num_elements());
301 std::copy_n(&value_stride(0), stride.size(), stride.begin());
302 }
303
304 OP_REQUIRES(context, ksize.size() == 4,
305 errors::InvalidArgument("Sliding window ksize field must "
306 "specify 4 dimensions"));
307 OP_REQUIRES(context, stride.size() == 4,
308 errors::InvalidArgument("Sliding window strides field must "
309 "specify 4 dimensions"));
310 OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
311 errors::Unimplemented(
312 "Pooling is not yet supported on the batch dimension."));
313 OP_REQUIRES(
314 context, ksize[3] == 1 && stride[3] == 1,
315 errors::Unimplemented(
316 "MaxPoolingGrad is not yet supported on the depth dimension."));
317
318 PoolParameters params{context,
319 ksize,
320 stride,
321 padding_,
322 explicit_paddings_,
323 FORMAT_NHWC,
324 tensor_in.shape()};
325 if (!context->status().ok()) {
326 return;
327 }
328
329 Tensor* output = nullptr;
330 OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
331 {0}, 0, output_shape, &output));
332
333 SpatialMaxPoolWithArgMaxHelper<CPUDevice, T, int64>(
334 context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
335 out_backprop, params, true);
336 }
337
338 private:
339 std::vector<int32> ksize_;
340 std::vector<int32> stride_;
341 Padding padding_;
342 std::vector<int64> explicit_paddings_;
343 TensorFormat data_format_;
344 };
345
346 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
347
348 template <class T>
349 class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
350 public:
351 typedef Eigen::GpuDevice Device;
352
353 explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
354 string data_format;
355 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
356 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
357 errors::InvalidArgument("Invalid data format"));
358 if (context->num_inputs() == 3) {
359 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
360 OP_REQUIRES(context, ksize_.size() == 4,
361 errors::InvalidArgument("Sliding window ksize field must "
362 "specify 4 dimensions"));
363 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
364 OP_REQUIRES(context, stride_.size() == 4,
365 errors::InvalidArgument("Sliding window strides field must "
366 "specify 4 dimensions"));
367 const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
368 const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
369 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
370 errors::Unimplemented(
371 "Pooling is not yet supported on the batch dimension."));
372 }
373 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
374 if (padding_ == Padding::EXPLICIT) {
375 OP_REQUIRES_OK(
376 context, context->GetAttr("explicit_paddings", &explicit_paddings_));
377 OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
378 /*num_dims=*/4, data_format_));
379 }
380 TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
381 &propagate_nans_));
382 }
383
384 void Compute(OpKernelContext* context) override {
385 const Tensor& tensor_in = context->input(0);
386 const Tensor& tensor_out = context->input(1);
387 const Tensor& out_backprop = context->input(2);
388
389 // For maxpooling, tensor_in should have 4 dimensions.
390 OP_REQUIRES(context, tensor_in.dims() == 4,
391 errors::InvalidArgument("tensor_in must be 4-dimensional 4"));
392 OP_REQUIRES(context, tensor_out.dims() == 4,
393 errors::InvalidArgument("tensor_out must be 4-dimensional"));
394 // For maxpooling, out_backprop should have 4 dimensions.
395 OP_REQUIRES(context, out_backprop.dims() == 4,
396 errors::InvalidArgument("out_backprop must be 4-dimensional"));
397
398 TensorShape output_shape = tensor_in.shape();
399
400 std::vector<int32> ksize = ksize_;
401 std::vector<int32> stride = stride_;
402 if (context->num_inputs() == 5) {
403 const Tensor& tensor_ksize = context->input(3);
404 auto value_ksize = tensor_ksize.flat<int32>();
405 ksize.resize(tensor_ksize.shape().num_elements());
406 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
407
408 const Tensor& tensor_stride = context->input(4);
409 auto value_stride = tensor_stride.flat<int32>();
410 stride.resize(tensor_stride.shape().num_elements());
411 std::copy_n(&value_stride(0), stride.size(), stride.begin());
412 }
413 OP_REQUIRES(context, ksize.size() == 4,
414 errors::InvalidArgument("Sliding window ksize field must "
415 "specify 4 dimensions"));
416 OP_REQUIRES(context, stride.size() == 4,
417 errors::InvalidArgument("Sliding window strides field must "
418 "specify 4 dimensions"));
419 const int32_t ksize_n = GetTensorDim(ksize, data_format_, 'N');
420 const int32_t stride_n = GetTensorDim(stride, data_format_, 'N');
421 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
422 errors::Unimplemented(
423 "Pooling is not yet supported on the batch dimension."));
424 int64_t pad_top, pad_bottom, pad_left, pad_right;
425 if (padding_ == Padding::EXPLICIT) {
426 GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'H',
427 /*pad_top=*/&pad_top,
428 /*pad_bottom=*/&pad_bottom);
429 GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'W',
430 /*pad_left=*/&pad_left,
431 /*pad_right=*/&pad_right);
432 }
433 DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize,
434 stride, padding_, explicit_paddings_,
435 data_format_, &tensor_in, &tensor_out,
436 out_backprop, output_shape, propagate_nans_);
437 }
438
439 private:
440 std::vector<int32> ksize_;
441 std::vector<int32> stride_;
442 Padding padding_;
443 std::vector<int64> explicit_paddings_;
444 TensorFormat data_format_;
445 bool propagate_nans_;
446 };
447
448 #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
449
450 // The operation to compute gradient of MaxPool gradients.
451 // It takes three inputs:
452 // - The original input tensor
453 // - The original output tensor
454 //   - Backprop tensor for the input gradient (same shape as the original input)
455 // It produces one output: the backprop tensor for the output gradient (same shape as the original output).
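// Intuitively, MaxPool's gradient is a scatter through the argmax, so its
// second-order gradient is the corresponding gather: for every pooled output
// position o whose window maximum sits at input position m(o),
//   bottom_diff[o] = top_diff[m(o)];
// which is what SpatialMaxPoolGradGrad below computes.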
456 template <class Device, class T>
457 class MaxPoolingGradGradOp : public OpKernel {
458 public:
459 explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
460 : OpKernel(context) {
461 string data_format;
462 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
463 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
464 errors::InvalidArgument("Invalid data format"));
465 OP_REQUIRES(
466 context, data_format_ == FORMAT_NHWC,
467 errors::InvalidArgument(
468 "Default MaxPoolingGradGradOp only supports NHWC ",
469 "on device type ", DeviceTypeString(context->device_type())));
470
471 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
472
473 if (context->num_inputs() == 3) {
474 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
475 OP_REQUIRES(context, ksize_.size() == 4,
476 errors::InvalidArgument("Sliding window ksize field must "
477 "specify 4 dimensions"));
478 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
479 OP_REQUIRES(context, stride_.size() == 4,
480 errors::InvalidArgument("Sliding window strides field must "
481 "specify 4 dimensions"));
482 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
483 errors::Unimplemented(
484 "Pooling is not yet supported on the batch dimension."));
485 OP_REQUIRES(context, ksize_[3] == 1 && stride_[3] == 1,
486 errors::Unimplemented("MaxPoolingGradGrad is not yet "
487 "supported on the depth dimension."));
488 }
489 }
490
491 void Compute(OpKernelContext* context) override {
492 const Tensor& tensor_in = context->input(0);
493 const Tensor& tensor_out = context->input(1);
494 const Tensor& out_grad_backprop = context->input(2);
495
496 // For maxpooling, tensor_in should have 4 dimensions.
497 OP_REQUIRES(context, tensor_in.dims() == 4,
498 errors::InvalidArgument("tensor_in must be 4-dimensional"));
499 OP_REQUIRES(context, tensor_out.dims() == 4,
500 errors::InvalidArgument("tensor_out must be 4-dimensional"));
501 // For maxpooling, out_grad_backprop should have 4 dimensions.
502 OP_REQUIRES(
503 context, out_grad_backprop.dims() == 4,
504 errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
505
506 std::vector<int32> ksize = ksize_;
507 std::vector<int32> stride = stride_;
508 if (context->num_inputs() == 5) {
509 const Tensor& tensor_ksize = context->input(3);
510 auto value_ksize = tensor_ksize.flat<int32>();
511 ksize.resize(tensor_ksize.shape().num_elements());
512 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
513
514 const Tensor& tensor_stride = context->input(4);
515 auto value_stride = tensor_stride.flat<int32>();
516 stride.resize(tensor_stride.shape().num_elements());
517 std::copy_n(&value_stride(0), stride.size(), stride.begin());
518 }
519
520 OP_REQUIRES(context, ksize.size() == 4,
521 errors::InvalidArgument("Sliding window ksize field must "
522 "specify 4 dimensions"));
523 OP_REQUIRES(context, stride.size() == 4,
524 errors::InvalidArgument("Sliding window strides field must "
525 "specify 4 dimensions"));
526 OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
527 errors::Unimplemented(
528 "Pooling is not yet supported on the batch dimension."));
529 OP_REQUIRES(
530 context, ksize[3] == 1 && stride[3] == 1,
531 errors::Unimplemented(
532 "MaxPoolingGrad is not yet supported on the depth dimension."));
533
534 PoolParameters params{context,
535 ksize,
536 stride,
537 padding_,
538 /*explicit_paddings=*/{},
539 FORMAT_NHWC,
540 tensor_in.shape()};
541 Tensor* output = nullptr;
542 OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
543 {2}, 0, tensor_out.shape(), &output));
544
545 SpatialMaxPoolGradGrad(context, output, tensor_in, tensor_out,
546 out_grad_backprop, params, padding_);
547 }
548
549 private:
550 void SpatialMaxPoolGradGrad(OpKernelContext* context, Tensor* bottom_diff,
551 const Tensor& tensor_in, const Tensor& tensor_out,
552 const Tensor& top_diff,
553 const PoolParameters& params,
554 const Padding& padding) {
555 typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
556 ConstEigenMatrixMap;
557 typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
558 EigenMatrixMap;
559
560 ConstEigenMatrixMap in_mat(
561 tensor_in.flat<T>().data(), params.depth,
562 params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
563 ConstEigenMatrixMap out_mat(
564 tensor_out.flat<T>().data(), params.depth,
565 params.out_width * params.out_height * params.tensor_in_batch);
566 ConstEigenMatrixMap top_diff_mat(
567 top_diff.flat<T>().data(), params.depth,
568 params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
569 EigenMatrixMap bottom_diff_mat(
570 bottom_diff->flat<T>().data(), params.depth,
571 params.out_width * params.out_height * params.tensor_in_batch);
572
573 const DeviceBase::CpuWorkerThreads& worker_threads =
574 *(context->device()->tensorflow_cpu_worker_threads());
575
576 // The code below does the following:
577 // 1. Flattens the input, output, top_diff and bottom_diff tensors into
578 // two dimensional arrays.
579 // tensor_in_as_matrix:
580 // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
581 // tensor_out_as_matrix:
582 // depth by (out_width * out_height * tensor_in_batch)
583 // top_diff_as_matrix:
584 // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
585 // bottom_diff_as_matrix:
586 // depth by (out_width * out_height * tensor_in_batch)
587 //
588 // 2. Walks through the set of columns in the flattened
589 // tensor_in_as_matrix, tensor_out_as_matrix, top_diff_as_matrix
590 // and updates the column(s) corresponding to the maximum values in
591 // tensor_out_as_matrix with the corresponding values in
592 // top_diff_as_matrix.
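//
// Note that no argmax tensor is available here, so the maximum is recovered
// by re-scanning each pooling window for the first input equal to the pooled
// output value; e.g. for a window holding [2, 7, 7] with pooled output 7, the
// top_diff value at the first 7 is copied to bottom_diff and the scan stops.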
593 auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
594 int64_t start, int64_t limit) {
595 const int32_t depth = params.depth;
596 const int32_t in_rows = params.tensor_in_rows;
597 const int32_t in_cols = params.tensor_in_cols;
598 const int32_t pad_top = params.pad_top;
599 const int32_t pad_left = params.pad_left;
600 const int32_t window_rows = params.window_rows;
601 const int32_t window_cols = params.window_cols;
602 const int32_t row_stride = params.row_stride;
603 const int32_t col_stride = params.col_stride;
604 const int32_t out_height = params.out_height;
605 const int32_t out_width = params.out_width;
606
607 {
608 // Initializes the output grad backprop tensor with 0.
609 const int32_t output_image_size = out_height * out_width * params.depth;
610 EigenMatrixMap bottom_diff_shard(
611 bottom_diff_mat.data() + start * output_image_size, 1,
612 (limit - start) * output_image_size);
613 bottom_diff_shard.setZero();
614 }
615
616 for (int b = start; b < limit; ++b) {
617 for (int ph = 0; ph < out_height; ++ph) {
618 for (int pw = 0; pw < out_width; ++pw) {
619 // (h_start, h_end) * (w_start, w_end) is the range that the input
620 // vector projects to.
621 int h_start = ph * row_stride - pad_top;
622 const int h_end = std::min(h_start + window_rows, in_rows);
623 int w_start = pw * col_stride - pad_left;
624 const int w_end = std::min(w_start + window_cols, in_cols);
625 h_start = std::max(h_start, 0);
626 w_start = std::max(w_start, 0);
627 const int out_index = (b * out_height + ph) * out_width + pw;
628 // Find value corresponding to the input maximum in top_diff.
629 for (int d = 0; d < depth; ++d) {
630 const T& output_ref = out_mat.coeffRef(d, out_index);
631 bool should_stop = false;
632 for (int h = h_start; h < h_end && !should_stop; ++h) {
633 for (int w = w_start; w < w_end && !should_stop; ++w) {
634 const int in_index = (b * in_rows + h) * in_cols + w;
635 const T& input_ref = in_mat.coeffRef(d, in_index);
636 if (output_ref == input_ref) {
637 T& bottom_diff_ref = bottom_diff_mat.coeffRef(d, out_index);
638 bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
639 should_stop = true;
640 }
641 }
642 }
643 }
644 }
645 }
646 }
647 };
648
649 const int64_t shard_cost = params.out_width * params.out_height *
650 params.depth * params.window_rows *
651 params.window_cols;
652 Shard(worker_threads.num_threads, worker_threads.workers,
653 params.tensor_in_batch, shard_cost, shard);
654 }
655
656 std::vector<int32> ksize_;
657 std::vector<int32> stride_;
658 Padding padding_;
659 TensorFormat data_format_;
660 };
661
662 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
663
664 template <class T>
665 class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
666 public:
667 typedef Eigen::GpuDevice Device;
668
669 explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
670 : OpKernel(context) {
671 string data_format;
672 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
673 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
674 errors::InvalidArgument("Invalid data format"));
675 if (context->num_inputs() == 3) {
676 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
677 OP_REQUIRES(context, ksize_.size() == 4,
678 errors::InvalidArgument("Sliding window ksize field must "
679 "specify 4 dimensions"));
680 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
681 OP_REQUIRES(context, stride_.size() == 4,
682 errors::InvalidArgument("Sliding window strides field must "
683 "specify 4 dimensions"));
684 const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
685 const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
686 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
687 errors::Unimplemented(
688 "Pooling is not yet supported on the batch dimension."));
689 }
690 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
691 }
692
693 void Compute(OpKernelContext* context) override {
694 const Tensor& tensor_in = context->input(0);
695 const Tensor& tensor_out = context->input(1);
696 const Tensor& out_grad_backprop = context->input(2);
697
698 // For maxpooling, tensor_in should have 4 dimensions.
699 OP_REQUIRES(context, tensor_in.dims() == 4,
700 errors::InvalidArgument("tensor_in must be 4-dimensional 4"));
701 OP_REQUIRES(context, tensor_out.dims() == 4,
702 errors::InvalidArgument("tensor_out must be 4-dimensional"));
703 // For maxpooling, out_grad_backprop should have 4 dimensions.
704 OP_REQUIRES(
705 context, out_grad_backprop.dims() == 4,
706 errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
707
708 Tensor* output = nullptr;
709 OP_REQUIRES_OK(context,
710 context->allocate_output(0, tensor_out.shape(), &output));
711
712 std::vector<int32> ksize = ksize_;
713 std::vector<int32> stride = stride_;
714 if (context->num_inputs() == 5) {
715 const Tensor& tensor_ksize = context->input(3);
716 auto value_ksize = tensor_ksize.flat<int32>();
717 ksize.resize(tensor_ksize.shape().num_elements());
718 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
719
720 const Tensor& tensor_stride = context->input(4);
721 auto value_stride = tensor_stride.flat<int32>();
722 stride.resize(tensor_stride.shape().num_elements());
723 std::copy_n(&value_stride(0), stride.size(), stride.begin());
724 }
725
726 OP_REQUIRES(context, ksize.size() == 4,
727 errors::InvalidArgument("Sliding window ksize field must "
728 "specify 4 dimensions"));
729 OP_REQUIRES(context, stride.size() == 4,
730 errors::InvalidArgument("Sliding window strides field must "
731 "specify 4 dimensions"));
732 const int32_t ksize_n = GetTensorDim(ksize, data_format_, 'N');
733 const int32_t stride_n = GetTensorDim(stride, data_format_, 'N');
734 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
735 errors::Unimplemented(
736 "Pooling is not yet supported on the batch dimension."));
737
738 PoolParameters params{context,
739 ksize,
740 stride,
741 padding_,
742 /*explicit_paddings=*/{},
743 data_format_,
744 tensor_in.shape()};
745
746 functor::MaxPoolGradBackwardNoMask<T>()(
747 data_format_, tensor_in.flat<T>().data(), tensor_out.flat<T>().data(),
748 params.tensor_in_batch, params.out_height, params.out_width,
749 params.depth, params.tensor_in_rows, params.tensor_in_cols,
750 params.window_rows, params.window_cols, params.row_stride,
751 params.col_stride, params.pad_top, params.pad_left,
752 out_grad_backprop.flat<T>().data(), output->flat<T>().data(),
753 context->eigen_device<Eigen::GpuDevice>());
754 }
755
756 private:
757 std::vector<int32> ksize_;
758 std::vector<int32> stride_;
759 Padding padding_;
760 TensorFormat data_format_;
761 bool use_dnn_;
762 };
763
764 #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
765
766 template <typename Device, typename T>
767 struct LaunchMaxPoolingNoMask;
768
769 template <typename Device, typename T>
770 class MaxPoolingNoMaskOp : public OpKernel {
771 public:
772 explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
773 : OpKernel(context) {
774 string data_format;
775 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
776 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
777 errors::InvalidArgument("Invalid data format"));
778 OP_REQUIRES(
779 context, data_format_ == FORMAT_NHWC,
780 errors::InvalidArgument(
781 "Default MaxPoolingNoMaskOp only supports NHWC on device type ",
782 DeviceTypeString(context->device_type())));
783 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
784 OP_REQUIRES(context, ksize_.size() == 4,
785 errors::InvalidArgument("Sliding window ksize field must "
786 "specify 4 dimensions"));
787 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
788 OP_REQUIRES(context, stride_.size() == 4,
789 errors::InvalidArgument("Sliding window stride field must "
790 "specify 4 dimensions"));
791 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
792 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
793 errors::Unimplemented(
794 "Pooling is not yet supported on the batch dimension."));
795 OP_REQUIRES(
796 context, padding_ != EXPLICIT,
797 errors::Unimplemented(
798 "Explicit padding is not supported for MaxPoolingNoMaskOp."));
799 }
800
801 void Compute(OpKernelContext* context) override {
802 const Tensor& tensor_in = context->input(0);
803
804 PoolParameters params{context,
805 ksize_,
806 stride_,
807 padding_,
808 /*explicit_paddings=*/{},
809 data_format_,
810 tensor_in.shape()};
811 if (!context->status().ok()) {
812 return;
813 }
814
815 TensorShape out_shape({params.tensor_in_batch, params.out_height,
816 params.out_width, params.depth});
817 Tensor* output = nullptr;
818 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
819
820 LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
821 output);
822 }
823
824 private:
825 std::vector<int32> ksize_;
826 std::vector<int32> stride_;
827 Padding padding_;
828 TensorFormat data_format_;
829 };
830
831 template <typename Device, typename T>
832 class MaxPoolingNoMaskV2Op : public OpKernel {
833 public:
834 explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
835 : OpKernel(context) {
836 string data_format;
837 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
838 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
839 errors::InvalidArgument("Invalid data format"));
840 OP_REQUIRES(
841 context, data_format_ == FORMAT_NHWC,
842 errors::InvalidArgument(
843 "Default MaxPoolingNoMaskOp only supports NHWC on device type ",
844 DeviceTypeString(context->device_type())));
845 if (context->num_inputs() == 1) {
846 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
847 OP_REQUIRES(context, ksize_.size() == 4,
848 errors::InvalidArgument("Sliding window ksize field must "
849 "specify 4 dimensions"));
850 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
851 OP_REQUIRES(context, stride_.size() == 4,
852 errors::InvalidArgument("Sliding window stride field must "
853 "specify 4 dimensions"));
854 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
855 errors::Unimplemented(
856 "Pooling is not yet supported on the batch dimension."));
857 }
858 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
859 }
860
861 void Compute(OpKernelContext* context) override {
862 const Tensor& tensor_in = context->input(0);
863
864 std::vector<int32> ksize = ksize_;
865 std::vector<int32> stride = stride_;
866
867 if (context->num_inputs() != 1) {
868 const Tensor& tensor_ksize = context->input(1);
869 auto value_ksize = tensor_ksize.flat<int32>();
870 ksize.resize(tensor_ksize.shape().num_elements());
871 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
872
873 const Tensor& tensor_stride = context->input(2);
874 auto value_stride = tensor_stride.flat<int32>();
875 stride.resize(tensor_stride.shape().num_elements());
876 std::copy_n(&value_stride(0), stride.size(), stride.begin());
877 }
878 OP_REQUIRES(context, ksize.size() == 4,
879 errors::InvalidArgument("Sliding window ksize field must "
880 "specify 4 dimensions"));
881 OP_REQUIRES(context, stride.size() == 4,
882 errors::InvalidArgument("Sliding window stride field must "
883 "specify 4 dimensions"));
884 OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
885 errors::Unimplemented(
886 "Pooling is not yet supported on the batch dimension."));
887 PoolParameters params{context,
888 ksize,
889 stride,
890 padding_,
891 /*explicit_paddings=*/{},
892 data_format_,
893 tensor_in.shape()};
894 if (!context->status().ok()) {
895 return;
896 }
897
898 TensorShape out_shape({params.tensor_in_batch, params.out_height,
899 params.out_width, params.depth});
900 Tensor* output = nullptr;
901 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
902
903 LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
904 output);
905 }
906
907 private:
908 std::vector<int32> ksize_;
909 std::vector<int32> stride_;
910 Padding padding_;
911 TensorFormat data_format_;
912 };
913
914 template <typename Device, typename T, typename Targmax>
915 struct LaunchMaxPoolingWithArgmax;
916
917 template <typename T, typename Targmax>
918 struct LaunchMaxPoolingWithArgmax<CPUDevice, T, Targmax> {
919 static void launch(OpKernelContext* context, const PoolParameters& params,
920 const Tensor& input, Tensor* output, Tensor* argmax,
921 bool propagate_nans, bool include_batch_in_index) {
922 Tensor unused;
923 SpatialMaxPoolWithArgMaxHelper<CPUDevice, T, Targmax>(
924 context, output, argmax, /*input_backprop=*/nullptr, input, unused,
925 params, include_batch_in_index);
926 }
927 };
928
929 template <typename Device, typename T, typename Targmax>
930 class MaxPoolingWithArgmaxOp : public OpKernel {
931 public:
932 explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context)
933 : OpKernel(context) {
934 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
935 OP_REQUIRES(context, ksize_.size() == 4,
936 errors::InvalidArgument("Sliding window ksize field must "
937 "specify 4 dimensions"));
938 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
939 OP_REQUIRES(context, stride_.size() == 4,
940 errors::InvalidArgument("Sliding window stride field must "
941 "specify 4 dimensions"));
942 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
943 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
944 errors::Unimplemented(
945 "Pooling is not yet supported on the batch dimension."));
946 OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
947 &include_batch_in_index_));
948 TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
949 &propagate_nans_));
950 }
951
952 void Compute(OpKernelContext* context) override {
953 const Tensor& tensor_in = context->input(0);
954 OP_REQUIRES(context, tensor_in.dims() == 4,
955 errors::InvalidArgument("tensor_in must be 4-dimensional (2)"));
956 OP_REQUIRES(context, tensor_in.NumElements() > 0,
957 errors::InvalidArgument("tensor_in must not be empty (2)"));
958
959 PoolParameters params{context,
960 ksize_,
961 stride_,
962 padding_,
963 /*explicit_paddings=*/{},
964 FORMAT_NHWC,
965 tensor_in.shape()};
966 if (!context->status().ok()) {
967 return;
968 }
969
970 TensorShape out_shape({params.tensor_in_batch, params.out_height,
971 params.out_width, params.depth});
972 Tensor* output = nullptr;
973 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
974 Tensor* argmax = nullptr;
975 OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));
976
977 LaunchMaxPoolingWithArgmax<Device, T, Targmax>::launch(
978 context, params, tensor_in, output, argmax, propagate_nans_,
979 include_batch_in_index_);
980 }
981
982 private:
983 std::vector<int32> ksize_;
984 std::vector<int32> stride_;
985 Padding padding_;
986 bool propagate_nans_;
987 bool include_batch_in_index_;
988 };
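// A rough Python-level usage sketch for the kernel above (assuming the usual
// tf.nn wrapper; the exact signature may differ between TF versions):
//
//   out, argmax = tf.nn.max_pool_with_argmax(
//       x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME",
//       include_batch_in_index=False)
//
// argmax holds flattened indices into the (per-image) input, using the
// encoding produced by SpatialMaxPoolWithArgMaxHelper.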
989
990 template <typename Device, typename T>
991 struct LaunchMaxPoolingGradWithArgmax;
992
993 template <typename T>
994 struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> {
995 typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
996 EigenMatrixMap;
997
998 static void launch(OpKernelContext* context, const PoolParameters& params,
999 const Tensor& grad_in, const Tensor& argmax,
1000 Tensor* grad_out, const bool include_batch_in_index) {
1001 const DeviceBase::CpuWorkerThreads& worker_threads =
1002 *(context->device()->tensorflow_cpu_worker_threads());
1003
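// Each shard zeroes its slice of grad_out and then scatters grad_in through
// the argmax. When include_batch_in_index is false the stored indices are
// per-image, so the batch offset (cur_batch * output_size_per_batch) is added
// back before scattering; the CHECK below keeps a malformed argmax from
// writing outside the shard's slice.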
1004 auto shard = [&grad_in, &argmax, &grad_out, include_batch_in_index](
1005 int64_t start, int64_t limit) {
1006 const int64_t batch_size =
1007 GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
1008 const int64_t output_size_per_batch =
1009 grad_out->NumElements() / batch_size;
1010 const int64_t input_size_per_batch = grad_in.NumElements() / batch_size;
1011
1012 {
1013 auto grad_out_flat = grad_out->flat<T>();
1014 auto argmax_flat = argmax.flat<int64>();
1015 auto grad_in_flat = grad_in.flat<T>();
1016
1017 const int64_t output_start = start * output_size_per_batch;
1018 const int64_t output_end = limit * output_size_per_batch;
1019 EigenMatrixMap inputShard(grad_out_flat.data() + output_start, 1,
1020 output_end - output_start);
1021 inputShard.setConstant(T(0));
1022
1023 const int input_start = start * input_size_per_batch;
1024 const int input_end = limit * input_size_per_batch;
1025 for (int64_t index = input_start; index < input_end; index++) {
1026 if (index >= argmax.NumElements()) {
1027 break;
1028 }
1029 int64_t grad_out_index = argmax_flat(index);
1030 if (!include_batch_in_index) {
1031 const int64_t cur_batch = index / input_size_per_batch;
1032 grad_out_index += cur_batch * output_size_per_batch;
1033 }
1034 CHECK(grad_out_index >= output_start && grad_out_index < output_end)
1035 << "Invalid output gradient index: " << grad_out_index << ", "
1036 << output_start << ", " << output_end;
1037 grad_out_flat(grad_out_index) += grad_in_flat(index);
1038 }
1039 }
1040 };
1041
1042 const int64_t batch_size =
1043 GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
1044 const int64_t shard_cost = grad_out->NumElements() / batch_size;
1045 Shard(worker_threads.num_threads, worker_threads.workers, batch_size,
1046 shard_cost, shard);
1047 }
1048 };
1049
1050 // TODO(b/175733711): Support int32 argmax type in MaxPoolGradWithArgmax op.
1051 template <typename Device, typename T>
1052 class MaxPoolingGradWithArgmaxOp : public OpKernel {
1053 public:
1054 explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context)
1055 : OpKernel(context) {
1056 string data_format_str;
1057 if (std::is_same<Device, GPUDevice>::value) {
1058 OP_REQUIRES(context, !tensorflow::OpDeterminismRequired(),
1059 errors::Unimplemented("Determinism is not yet supported "
1060 "for MaxPoolGradWithArgmax."));
1061 }
1062 auto status = context->GetAttr("data_format", &data_format_str);
1063 if (status.ok()) {
1064 OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
1065 errors::InvalidArgument("Invalid data format"));
1066 }
1067
1068 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
1069 OP_REQUIRES(context, ksize_.size() == 4,
1070 errors::InvalidArgument("Sliding window ksize field must "
1071 "specify 4 dimensions"));
1072 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
1073 OP_REQUIRES(context, stride_.size() == 4,
1074 errors::InvalidArgument("Sliding window stride field must "
1075 "specify 4 dimensions"));
1076 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1077 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
1078 errors::Unimplemented(
1079 "Pooling is not yet supported on the batch dimension."));
1080 OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
1081 &include_batch_in_index_));
1082 }
1083
1084 void Compute(OpKernelContext* context) override {
1085 const Tensor& tensor_in = context->input(0);
1086 const Tensor& grad_in = context->input(1);
1087 const Tensor& argmax = context->input(2);
1088
1089 PoolParameters params{context,
1090 ksize_,
1091 stride_,
1092 padding_,
1093 /*explicit_paddings=*/{},
1094 FORMAT_NHWC,
1095 tensor_in.shape()};
1096 if (!context->status().ok()) {
1097 return;
1098 }
1099
1100 TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows,
1101 params.tensor_in_cols, params.depth});
1102 Tensor* grad_out = nullptr;
1103 OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
1104 {0}, 0, out_shape, &grad_out));
1105
1106 if (out_shape.num_elements() == 0) return; // nothing to be done
1107
1108 LaunchMaxPoolingGradWithArgmax<Device, T>::launch(
1109 context, params, grad_in, argmax, grad_out, include_batch_in_index_);
1110 }
1111
1112 private:
1113 std::vector<int32> ksize_;
1114 std::vector<int32> stride_;
1115 Padding padding_;
1116 TensorFormat data_format_;
1117 bool include_batch_in_index_;
1118 };
1119
1120 template <typename Device, typename T>
1121 struct LaunchMaxPoolingGradGradWithArgmax;
1122
1123 template <typename Device, typename T>
1124 class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
1125 public:
1126 explicit MaxPoolingGradGradWithArgmaxOp(OpKernelConstruction* context)
1127 : OpKernel(context) {
1128 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
1129 OP_REQUIRES(context, ksize_.size() == 4,
1130 errors::InvalidArgument("Sliding window ksize field must "
1131 "specify 4 dimensions"));
1132 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
1133 OP_REQUIRES(context, stride_.size() == 4,
1134 errors::InvalidArgument("Sliding window stride field must "
1135 "specify 4 dimensions"));
1136 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1137 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
1138 errors::Unimplemented(
1139 "Pooling is not yet supported on the batch dimension."));
1140 OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
1141 &include_batch_in_index_));
1142 }
1143
1144 void Compute(OpKernelContext* context) override {
1145 const Tensor& tensor_in = context->input(0);
1146 const Tensor& grad_in = context->input(1);
1147 const Tensor& argmax = context->input(2);
1148
1149 PoolParameters params{context,
1150 ksize_,
1151 stride_,
1152 padding_,
1153 /*explicit_paddings=*/{},
1154 FORMAT_NHWC,
1155 tensor_in.shape()};
1156 if (!context->status().ok()) {
1157 return;
1158 }
1159
1160 TensorShape out_shape({params.tensor_in_batch, params.out_height,
1161 params.out_width, params.depth});
1162
1163 Tensor* grad_out = nullptr;
1164 OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
1165 {0}, 0, out_shape, &grad_out));
1166
1167 LaunchMaxPoolingGradGradWithArgmax<Device, T>::launch(
1168 context, params, grad_in, argmax, grad_out, include_batch_in_index_);
1169 }
1170
1171 private:
1172 std::vector<int32> ksize_;
1173 std::vector<int32> stride_;
1174 Padding padding_;
1175 bool include_batch_in_index_;
1176 };
1177
1178 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1179 template <typename T>
1180 class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
1181 public:
1182 typedef GPUDevice Device;
1183 explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
1184 : OpKernel(context) {
1185 string data_format;
1186 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
1187 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
1188 errors::InvalidArgument("Invalid data format"));
1189 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
1190 OP_REQUIRES(context, ksize_.size() == 4,
1191 errors::InvalidArgument("Sliding window ksize field must "
1192 "specify 4 dimensions"));
1193 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
1194 OP_REQUIRES(context, stride_.size() == 4,
1195 errors::InvalidArgument("Sliding window stride field must "
1196 "specify 4 dimensions"));
1197 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1198 OP_REQUIRES_OK(context,
1199 context->GetAttr("explicit_paddings", &explicit_paddings_));
1200 const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
1201 const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
1202 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
1203 errors::Unimplemented(
1204 "Pooling is not yet supported on the batch dimension."));
1205
1206 TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
1207 &propagate_nans_));
1208 }
1209
1210 void Compute(OpKernelContext* context) override {
1211 const Tensor& tensor_in = context->input(0);
1212
1213 PoolParameters params{
1214 context, ksize_, stride_, padding_, explicit_paddings_,
1215 data_format_, tensor_in.shape()};
1216 if (!context->status().ok()) {
1217 return;
1218 }
1219
1220 TensorShape out_shape =
1221 ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
1222 params.out_width, params.depth);
1223
1224 // Assuming qint8 <--> NCHW_VECT_C (int8x4) here.
1225 constexpr bool is_int8x4 = std::is_same<T, qint8>::value;
1226 OP_REQUIRES(context, (is_int8x4 == (data_format_ == FORMAT_NCHW_VECT_C)),
1227 errors::InvalidArgument(
1228 "qint8 should be used with data_format NCHW_VECT_C."));
1229
1230 #if CUDNN_VERSION >= 7300
1231 DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
1232 stride_, padding_, explicit_paddings_,
1233 data_format_, tensor_in, out_shape,
1234 propagate_nans_);
1235 #else
1236 // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
1237 if (!is_int8x4 && data_format_ == FORMAT_NCHW) {
1238 DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
1239 stride_, padding_, explicit_paddings_,
1240 data_format_, tensor_in, out_shape,
1241 propagate_nans_);
1242 } else {
1243 #if !defined(TENSORFLOW_USE_ROCM)
1244 OP_REQUIRES(context, padding_ != EXPLICIT,
1245 errors::Unimplemented("Explicit padding is not supported ",
1246 "when CUDNN is not enabled."));
1247 #endif
1248 Tensor* output = nullptr;
1249 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
1250 if (is_int8x4) {
1251 LaunchMaxPoolingNoMask_NCHW_VECT_C<Device>::launch(context, params,
1252 tensor_in, output);
1253 } else if (data_format_ == FORMAT_NHWC) {
1254 LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
1255 output, propagate_nans_);
1256 } else {
1257 LOG(FATAL) << "MaxPool currently only supports the following (layout, "
1258 "type) combinations: (NHWC, non-qint8), "
1259 "(NCHW, non-qint8) or (NCHW_VECT_C, qint8). The "
1260 "requested combination ("
1261 << ToString(data_format_) << ", "
1262 << DataTypeString(DataTypeToEnum<T>::v())
1263 << ") is not supported.";
1264 }
1265 }
1266 #endif
1267 }
1268
1269 private:
1270 std::vector<int32> ksize_;
1271 std::vector<int32> stride_;
1272 Padding padding_;
1273 std::vector<int64> explicit_paddings_;
1274 TensorFormat data_format_;
1275 bool propagate_nans_;
1276 };
1277
1278 template <typename T>
1279 class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
1280 public:
1281 typedef GPUDevice Device;
1282 explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
1283 : OpKernel(context) {
1284 string data_format;
1285 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
1286 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
1287 errors::InvalidArgument("Invalid data format"));
1288 if (context->num_inputs() == 1) {
1289 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
1290 OP_REQUIRES(context, ksize_.size() == 4,
1291 errors::InvalidArgument("Sliding window ksize field must "
1292 "specify 4 dimensions"));
1293 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
1294 OP_REQUIRES(context, stride_.size() == 4,
1295 errors::InvalidArgument("Sliding window stride field must "
1296 "specify 4 dimensions"));
1297 const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
1298 const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
1299 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
1300 errors::Unimplemented(
1301 "Pooling is not yet supported on the batch dimension."));
1302 }
1303 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1304 TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
1305 &propagate_nans_));
1306 }
1307
1308 void Compute(OpKernelContext* context) override {
1309 const Tensor& tensor_in = context->input(0);
1310
1311 std::vector<int32> ksize = ksize_;
1312 std::vector<int32> stride = stride_;
1313
1314 if (context->num_inputs() != 1) {
1315 const Tensor& tensor_ksize = context->input(1);
1316 auto value_ksize = tensor_ksize.flat<int32>();
1317 ksize.resize(tensor_ksize.shape().num_elements());
1318 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
1319
1320 const Tensor& tensor_stride = context->input(2);
1321 auto value_stride = tensor_stride.flat<int32>();
1322 stride.resize(tensor_stride.shape().num_elements());
1323 std::copy_n(&value_stride(0), stride.size(), stride.begin());
1324 }
1325 OP_REQUIRES(context, ksize.size() == 4,
1326 errors::InvalidArgument("Sliding window ksize field must "
1327 "specify 4 dimensions"));
1328 OP_REQUIRES(context, stride.size() == 4,
1329 errors::InvalidArgument("Sliding window stride field must "
1330 "specify 4 dimensions"));
1331 const int32_t ksize_n = GetTensorDim(ksize, data_format_, 'N');
1332 const int32_t stride_n = GetTensorDim(stride, data_format_, 'N');
1333 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
1334 errors::Unimplemented(
1335 "Pooling is not yet supported on the batch dimension."));
1336
1337 PoolParameters params{context,
1338 ksize,
1339 stride,
1340 padding_,
1341 /*explicit_paddings=*/{},
1342 data_format_,
1343 tensor_in.shape()};
1344 if (!context->status().ok()) {
1345 return;
1346 }
1347
1348 TensorShape out_shape =
1349 ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
1350 params.out_width, params.depth);
1351 if (data_format_ == FORMAT_NCHW) {
1352 DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize,
1353 stride, padding_, explicit_paddings_,
1354 data_format_, tensor_in, out_shape,
1355 propagate_nans_);
1356 } else {
1357 CHECK(data_format_ == FORMAT_NHWC)
1358 << "MaxPool only supports NCHW or NHWC format";
1359 Tensor* output = nullptr;
1360 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
1361 LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
1362 output, propagate_nans_);
1363 }
1364 }
1365
1366 private:
1367 std::vector<int32> ksize_;
1368 std::vector<int32> stride_;
1369 Padding padding_;
1370 std::vector<int64> explicit_paddings_;
1371 TensorFormat data_format_;
1372 bool propagate_nans_;
1373 };
1374
1375 template <typename T>
1376 struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
1377 static void launch(OpKernelContext* context, const PoolParameters& params,
1378 const Tensor& input, Tensor* output, bool propagate_nans) {
1379 bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
1380 input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
1381 params.tensor_in_cols, params.depth, params.out_height,
1382 params.out_width, params.window_rows, params.window_cols,
1383 params.row_stride, params.col_stride, params.pad_top, params.pad_left,
1384 output->flat<T>().data(), nullptr, context->eigen_gpu_device(),
1385 propagate_nans, false);
1386 if (!status) {
1387 context->SetStatus(
1388 errors::Internal("Failed launching MaxPoolForwardNoMask"));
1389 }
1390 }
1391 };
1392
1393 template <typename T>
1394 struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T, int64> {
1395 static void launch(OpKernelContext* context, const PoolParameters& params,
1396 const Tensor& input, Tensor* output, Tensor* argmax,
1397 bool propagate_nans, bool include_batch_in_index) {
1398 bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
1399 input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
1400 params.tensor_in_cols, params.depth, params.out_height,
1401 params.out_width, params.window_rows, params.window_cols,
1402 params.row_stride, params.col_stride, params.pad_top, params.pad_left,
1403 output->flat<T>().data(),
1404 reinterpret_cast<int64*>(argmax->flat<int64>().data()),
1405 context->eigen_gpu_device(), propagate_nans, include_batch_in_index);
1406 if (!status) {
1407 context->SetStatus(
1408 errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
1409 }
1410 }
1411 };
1412
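// top_offset and bottom_offset below are the per-image element counts of the
// pooled output and of the original input, respectively; the backward kernel
// uses them to recover each image's slice when the argmax indices do not
// include the batch dimension.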
template <typename T>
struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out, const bool include_batch_in_index) {
    const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                           params.tensor_in_cols * params.depth;
    const int output_size = params.tensor_in_batch * params.out_height *
                            params.out_width * params.depth;
    const int top_offset = params.out_height * params.out_width * params.depth;
    const int bottom_offset =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    bool status = functor::MaxPoolBackwardWithArgmax<T>()(
        output_size, input_size, grad_in.flat<T>().data(),
        reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
        include_batch_in_index);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
    }
  }
};

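// Second-order gradient: grad_in here has the shape of the original input and
// grad_out the shape of the pooled output, so top_offset and bottom_offset are
// reversed relative to LaunchMaxPoolingGradWithArgmax above.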
template <typename T>
struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out, const bool include_batch_in_index) {
    const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                           params.tensor_in_cols * params.depth;
    const int output_size = params.tensor_in_batch * params.out_height *
                            params.out_width * params.depth;
    const int top_offset =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    const int bottom_offset =
        params.out_width * params.out_height * params.depth;
    bool status = functor::MaxPoolGradBackwardWithArgmax<T>()(
        output_size, input_size, grad_in.flat<T>().data(),
        reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
        include_batch_in_index);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
    }
  }
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_MAX_POOL_KERNELS(D, T)                                  \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("MaxPoolGrad").Device(DEVICE_##D).TypeConstraint<T>("T"),     \
      MaxPoolingGradOp<D##Device, T>);                                   \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("MaxPoolGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
      MaxPoolingGradGradOp<D##Device, T>);                               \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradV2")                          \
                              .Device(DEVICE_##D)                        \
                              .HostMemory("ksize")                       \
                              .HostMemory("strides")                     \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingGradOp<D##Device, T>);               \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradV2")                      \
                              .Device(DEVICE_##D)                        \
                              .HostMemory("ksize")                       \
                              .HostMemory("strides")                     \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingGradGradOp<D##Device, T>)            \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                      \
                              .Device(DEVICE_##D)                        \
                              .TypeConstraint<int64>("Targmax")          \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingWithArgmaxOp<D##Device, T, int64>);  \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")                  \
                              .Device(DEVICE_##D)                        \
                              .TypeConstraint<T>("T")                    \
                              .TypeConstraint<int64>("Targmax"),         \
                          MaxPoolingGradWithArgmaxOp<D##Device, T>);
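
// As an illustration, REGISTER_MAX_POOL_KERNELS(GPU, float) expands to
// registrations of the float GPU kernels for MaxPoolGrad, MaxPoolGradGrad,
// MaxPoolGradV2, MaxPoolGradGradV2, MaxPoolWithArgmax (Targmax = int64), and
// MaxPoolGradWithArgmax.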

// The kernels below are implemented only for the CPU device.
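// MaxPoolWithArgmax is registered here with Targmax = int32; the
// Targmax = int64 variant is registered via REGISTER_MAX_POOL_KERNELS above.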
#define REGISTER_CPU_ONLY_POOL_KERNELS(T)                          \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
      MaxPoolingOp<CPUDevice, T>);                                 \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("MaxPoolV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      MaxPoolingV2Op<CPUDevice, T>);                               \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                \
                              .Device(DEVICE_CPU)                  \
                              .TypeConstraint<int32>("Targmax")    \
                              .TypeConstraint<T>("T"),             \
                          MaxPoolingWithArgmaxOp<CPUDevice, T, int32>);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
#undef REGISTER_CPU_ONLY_POOL_KERNELS

#define REGISTER_CPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(CPU, T);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
#undef REGISTER_CPU_MAX_POOL_KERNELS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Forward declarations for the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                            \
  template <>                                                          \
  void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()(             \
      const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
      typename TTypes<T, 4>::ConstTensor input, int window_rows,       \
      int window_cols, int row_stride, int col_stride,                 \
      const Eigen::PaddingType& padding);                              \
  extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC
}  // namespace functor

#define REGISTER_GPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(GPU, T)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
#undef REGISTER_GPU_MAX_POOL_KERNELS

// The kernels below are currently implemented only for the GPU device.
// Note(jiayq): currently the custom Caffe implementation is faster than the
// default Eigen implementation, so the custom kernel is registered as the
// default. The Eigen version can still be invoked explicitly through
// kernel_label_map.
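// For example, with the experimental Python API (assuming
// Graph._kernel_label_map is available in your TensorFlow build):
//
//   with g._kernel_label_map({"MaxPool": "eigen_tensor"}):
//     y = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
//                        padding="SAME")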
#define REGISTER_GPU_ONLY_POOL_KERNELS(T)                        \
  REGISTER_KERNEL_BUILDER(Name("MaxPool")                        \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<T>("T")            \
                              .Label("eigen_tensor"),            \
                          MaxPoolingOp<GPUDevice, T>);           \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                      \
                              .Device(DEVICE_GPU)                \
                              .HostMemory("ksize")               \
                              .HostMemory("strides")             \
                              .TypeConstraint<T>("T")            \
                              .Label("eigen_tensor"),            \
                          MaxPoolingV2Op<GPUDevice, T>);         \
  REGISTER_KERNEL_BUILDER(                                       \
      Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
      MaxPoolingNoMaskOp<GPUDevice, T>);                         \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                      \
                              .Device(DEVICE_GPU)                \
                              .HostMemory("ksize")               \
                              .HostMemory("strides")             \
                              .TypeConstraint<T>("T"),           \
                          MaxPoolingNoMaskV2Op<GPUDevice, T>);   \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")      \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<T>("T")            \
                              .TypeConstraint<int64>("Targmax"), \
                          MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS);

// TODO(b/65847473): Re-enable once the underlying build error is fixed.
#if !defined(PLATFORM_WINDOWS)
REGISTER_KERNEL_BUILDER(
    Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
    MaxPoolingNoMaskOp<GPUDevice, qint8>);

REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("ksize")
                            .HostMemory("strides")
                            .TypeConstraint<qint8>("T"),
                        MaxPoolingV2Op<GPUDevice, qint8>);

REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("ksize")
                            .HostMemory("strides")
                            .TypeConstraint<qint8>("T")
                            .Label("eigen_tensor"),
                        MaxPoolingV2Op<GPUDevice, qint8>);
#endif  // !defined(PLATFORM_WINDOWS)

#undef REGISTER_GPU_ONLY_POOL_KERNELS

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef REGISTER_MAX_POOL_KERNELS

}  // namespace tensorflow