/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/avgpooling_op.h"

#include <vector>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/eigen_pooling.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/pooling_ops_common.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"

#if GOOGLE_CUDA
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T>
class AvgPoolingOp : public UnaryOp<T> {
 public:
  explicit AvgPoolingOp(OpKernelConstruction* context) : UnaryOp<T>(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument("Default AvgPoolingOp only supports NHWC ",
                                "on device type ",
                                DeviceTypeString(context->device_type())));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    PoolParameters params{context,  ksize_,       stride_,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }
    OP_REQUIRES(context, params.depth_window == 1,
                errors::Unimplemented("Non-spatial pooling is not "
                                      "yet supported. Volunteers? :)"));

    // For avgpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    SpatialAvgPool<Device, T>(context, output, tensor_in, params, padding_);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};
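
// For reference, params.forward_output_shape() follows the usual window
// arithmetic. For an input extent H with window k and stride s:
//   VALID: out = ceil((H - k + 1) / s)
//   SAME:  out = ceil(H / s), with max((out - 1) * s + k - H, 0) total
//          padding split between the two sides.
// E.g. H = 5, k = 3, s = 2 yields out = 2 for VALID and out = 3 for SAME.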

REGISTER_KERNEL_BUILDER(
    Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<double>("T"),
    AvgPoolingOp<CPUDevice, double>);
REGISTER_KERNEL_BUILDER(
    Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
    AvgPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
    AvgPoolingOp<CPUDevice, Eigen::half>);

#if GOOGLE_CUDA
template <typename T>
class AvgPoolingOp<GPUDevice, T> : public UnaryOp<T> {
 public:
  typedef GPUDevice Device;
  explicit AvgPoolingOp(OpKernelConstruction* context) : UnaryOp<T>(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    PoolParameters params{context,  ksize_,       stride_,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }
    OP_REQUIRES(context, params.depth_window == 1,
                errors::Unimplemented("Non-spatial pooling is not "
                                      "yet supported. Volunteers? :)"));

    // For avgpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));

    TensorShape output_shape = params.forward_output_shape();

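    // NCHW inputs are delegated to cuDNN below; NHWC inputs fall through to
    // the Eigen SpatialAvgPooling functor.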
    if (data_format_ == FORMAT_NCHW) {
      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, ksize_,
                               stride_, padding_, data_format_, tensor_in,
                               output_shape,
                               /*propagate_nans=*/false);
    } else {
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, output_shape, &output));
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding_);
      functor::SpatialAvgPooling<Device, T>()(
          context->eigen_device<Device>(), output->tensor<T, 4>(),
          tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
          params.row_stride, params.col_stride, pt);
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                      \
  template <>                                                    \
  void SpatialAvgPooling<GPUDevice, T>::operator()(              \
      const GPUDevice& d, typename TTypes<T, 4>::Tensor output,  \
      typename TTypes<T, 4>::ConstTensor input, int window_rows, \
      int window_cols, int row_stride, int col_stride,           \
      const Eigen::PaddingType& padding);                        \
  extern template struct SpatialAvgPooling<GPUDevice, T>;

DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(double);
#undef DECLARE_GPU_SPEC
}  // namespace functor

REGISTER_KERNEL_BUILDER(
    Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
    AvgPoolingOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
    Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    AvgPoolingOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<double>("T"),
    AvgPoolingOp<GPUDevice, double>);
#endif  // GOOGLE_CUDA

// The operation to compute AvgPool gradients.
// It takes two inputs:
//   - The original input tensor shape
//   - Backprop tensor for output
// It produces one output: backprop tensor for input.
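// For intuition: with a 2x2 window, stride 2 and VALID padding, each output
// element averaged exactly 4 inputs, so its incoming gradient g is scattered
// back as g / 4 to each of those 4 input cells. Windows clipped by SAME
// padding instead divide by the number of input cells actually covered (see
// divide_coeff below).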
template <typename Device, class T>
class AvgPoolingGradOp : public OpKernel {
 public:
  explicit AvgPoolingGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument("Default AvgPoolingGradOp only supports NHWC ",
                                "on device type ",
                                DeviceTypeString(context->device_type())));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in_shape = context->input(0);
    const Tensor& out_backprop = context->input(1);
    // For avgpooling, tensor_in_shape should have 1 dimension and 4 elements.
    OP_REQUIRES(
        context,
        tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
        errors::InvalidArgument("orig_input_shape must be 1-dimensional and "
                                "contain 4 elements"));
    // For avgpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));
    const int64 out_backprop_batch = out_backprop.dim_size(0);
    const int64 out_backprop_rows = out_backprop.dim_size(1);
    const int64 out_backprop_cols = out_backprop.dim_size(2);
    const int64 out_backprop_depth = out_backprop.dim_size(3);

    TensorShape output_shape;
    auto shape_vec = tensor_in_shape.vec<int32>();
    for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
      output_shape.AddDim(shape_vec(i));
    }
    const int64 in_rows = output_shape.dim_size(1);
    const int64 in_cols = output_shape.dim_size(2);

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, output_shape, &output));
    output->flat<T>().setZero();

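    // The constructor has already required NHWC for this kernel, so indices
    // 1 and 2 of ksize_/stride_ are the spatial (row, col) dimensions and
    // index 3 is depth.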
    const int window_rows = ksize_[1];
    const int window_cols = ksize_[2];
    const int depth_window = ksize_[3];

    const int row_stride = stride_[1];
    const int col_stride = stride_[2];

    // We (will) use different code for spatial pooling and
    // non-spatial pooling.
    //
    // Spatial pooling is when depth_window = 1
    OP_REQUIRES(context, depth_window == 1,
                errors::Unimplemented("Non-spatial pooling is not "
                                      "yet supported. Volunteers? :)"));

    int64 out_height, out_width, pad_rows, pad_cols;
    OP_REQUIRES_OK(context,
                   GetWindowedOutputSize(in_rows, window_rows, row_stride,
                                         padding_, &out_height, &pad_rows));
    OP_REQUIRES_OK(context,
                   GetWindowedOutputSize(in_cols, window_cols, col_stride,
                                         padding_, &out_width, &pad_cols));
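    // Only pad_rows and pad_cols are consumed below; out_height and out_width
    // fall out of the same window arithmetic but go unused here.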

    const T* out_backprop_ptr = out_backprop.flat<T>().data();
    T* input_backprop_ptr = output->flat<T>().data();

    auto shard = [context, out_backprop_ptr, input_backprop_ptr,
                  out_backprop_rows, out_backprop_cols, out_backprop_depth,
                  in_rows, in_cols, window_rows, window_cols, row_stride,
                  col_stride, pad_rows, pad_cols](int64 start, int64 limit) {
      for (int64 b = start; b < limit; ++b) {
        for (int64 r = 0; r < out_backprop_rows; ++r) {
          // Calculates row broadcast size.  For SAME padding, current
          // index could be in the padding area, and r*row_stride +
          // window_rows could be beyond the input tensor's boundary. In
          // such cases, change the starting index and reduce the
          // broadcast size.
          int rindex, rsize;
          OP_REQUIRES_OK(context,
                         GetBroadcastSize(r, in_rows, window_rows, row_stride,
                                          pad_rows, &rindex, &rsize));
          for (int64 c = 0; c < out_backprop_cols; ++c) {
            // Calculates col broadcast size.  For SAME padding, current
            // index could be in the padding area, and c*col_stride +
            // window_cols could be beyond the input tensor's boundary. In
            // such cases, change the starting index and reduce the
            // broadcast size.
            int cindex, csize;
            OP_REQUIRES_OK(context,
                           GetBroadcastSize(c, in_cols, window_cols, col_stride,
                                            pad_cols, &cindex, &csize));

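            // Each output gradient is shared evenly by the rsize * csize
            // input cells its (possibly clipped) window covered. E.g. with
            // in_rows = 4, window_rows = 3, stride 1 and SAME padding
            // (pad_rows = 1), the r = 0 window hangs one row past the top of
            // the input, so rsize = 2 and the divisor shrinks accordingly.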
            T divide_coeff(1.0 / (rsize * csize));
            int64 output_index =
                (b * out_backprop_rows + r) * out_backprop_cols + c;
            for (int64 r_dst = rindex; r_dst < rindex + rsize; ++r_dst) {
              for (int64 c_dst = cindex; c_dst < cindex + csize; ++c_dst) {
                int64 input_index = (b * in_rows + r_dst) * in_cols + c_dst;
                const T* output_offset =
                    out_backprop_ptr + output_index * out_backprop_depth;
                T* input_offset =
                    input_backprop_ptr + input_index * out_backprop_depth;
                for (int64 d = 0; d < out_backprop_depth; ++d) {
                  *input_offset += *output_offset * divide_coeff;
                  ++output_offset;
                  ++input_offset;
                }
              }
            }
          }
        }
      }
    };

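    // Shard the work over the batch dimension. shard_cost is a rough
    // per-image cycle estimate used only to choose shard granularity (note
    // that in_rows is multiplied in twice, which may overstate the cost);
    // a poor estimate affects load balancing, not the computed gradients.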
    const DeviceBase::CpuWorkerThreads& worker_threads =
        *(context->device()->tensorflow_cpu_worker_threads());
    const int64 shard_cost =
        window_rows * window_cols * depth_window * in_rows * in_rows * in_cols;
    Shard(worker_threads.num_threads, worker_threads.workers,
          out_backprop_batch, shard_cost, shard);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

#define REGISTER_CPU_KERNEL(T)                                 \
  REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")                  \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<T>("T")          \
                              .HostMemory("orig_input_shape"), \
                          AvgPoolingGradOp<CPUDevice, T>);

TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
TF_CALL_half(REGISTER_CPU_KERNEL);

#if GOOGLE_CUDA

// A cuDNN-based AvgPoolingGrad implementation. It includes padded cells among
// the candidates for the pooling operation.
template <class T>
class AvgPoolingGradOp<GPUDevice, T> : public OpKernel {
 public:
  typedef GPUDevice Device;

  explicit AvgPoolingGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in_shape = context->input(0);
    const Tensor& out_backprop = context->input(1);
    // For avgpooling, tensor_in_shape should have 1 dimension and 4 elements.
    OP_REQUIRES(
        context,
        tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
        errors::InvalidArgument("orig_input_shape must be 1-dimensional and "
                                "contain 4 elements"));
    // For avgpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));

    TensorShape output_shape;
    auto shape_vec = tensor_in_shape.vec<int32>();
    for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
      output_shape.AddDim(shape_vec(i));
    }

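    // Average-pooling backprop needs neither the original input nor the
    // original output values, so nullptr is passed for both; cuDNN only
    // requires those tensors for max pooling.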
    DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kAverage,
                                 ksize_, stride_, padding_, data_format_,
                                 nullptr, nullptr, out_backprop, output_shape,
                                 /*propagate_nans=*/false);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<double>("T")
                            .HostMemory("orig_input_shape")
                            .Label("cudnn"),
                        AvgPoolingGradOp<GPUDevice, double>);
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<float>("T")
                            .HostMemory("orig_input_shape")
                            .Label("cudnn"),
                        AvgPoolingGradOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<Eigen::half>("T")
                            .HostMemory("orig_input_shape")
                            .Label("cudnn"),
                        AvgPoolingGradOp<GPUDevice, Eigen::half>);
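
// The Label("cudnn") on the registrations above means these kernels are
// selected only when a node explicitly requests the "cudnn" kernel label;
// unlabeled GPU AvgPoolGrad nodes use the custom-kernel implementation
// registered below.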

// A custom GPU-kernel-based AvgPoolingGrad implementation. It includes padded
// cells among the candidates for the pooling operation.
template <class T>
class AvgPoolingGradOpCustomGPUKernel : public OpKernel {
 public:
  typedef GPUDevice Device;

  explicit AvgPoolingGradOpCustomGPUKernel(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in_shape = context->input(0);
    const Tensor& out_backprop = context->input(1);
    // For avgpooling, tensor_in_shape should have 1 dimension and 4 elements.
    OP_REQUIRES(
        context,
        tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
        errors::InvalidArgument("orig_input_shape must be 1-dimensional and "
                                "contain 4 elements"));
    // For avgpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));
    TensorShape output_shape;
    auto shape_vec = tensor_in_shape.vec<int32>();
    for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
      output_shape.AddDim(shape_vec(i));
    }

    if (data_format_ == FORMAT_NHWC) {
      const int64 out_backprop_batch = out_backprop.dim_size(0);
      const int64 out_backprop_rows = out_backprop.dim_size(1);
      const int64 out_backprop_cols = out_backprop.dim_size(2);
      const int64 out_backprop_depth = out_backprop.dim_size(3);

      const int64 in_rows = output_shape.dim_size(1);
      const int64 in_cols = output_shape.dim_size(2);
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, output_shape, &output));

      const int window_rows = ksize_[1];
      const int window_cols = ksize_[2];
      const int depth_window = ksize_[3];

      const int row_stride = stride_[1];
      const int col_stride = stride_[2];

      // We (will) use different code for spatial pooling and
      // non-spatial pooling.
      //
      // Spatial pooling is when depth_window = 1
      OP_REQUIRES(context, depth_window == 1,
                  errors::Unimplemented("Non-spatial pooling is not "
                                        "yet supported. Volunteers? :)"));

      int64 out_height, out_width, pad_rows, pad_cols;
      OP_REQUIRES_OK(context,
                     GetWindowedOutputSize(in_rows, window_rows, row_stride,
                                           padding_, &out_height, &pad_rows));
      OP_REQUIRES_OK(context,
                     GetWindowedOutputSize(in_cols, window_cols, col_stride,
                                           padding_, &out_width, &pad_cols));

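      // RunAvePoolBackwardNHWC (declared in maxpooling_op_gpu.h) launches a
      // custom CUDA kernel; roughly, each GPU thread owns one bottom_diff
      // element and accumulates the contribution of every pooling window
      // covering it, each weighted by the inverse of that window's size.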
      RunAvePoolBackwardNHWC<T>(out_backprop.flat<T>().data(),  // top_diff
                                out_backprop_batch,             // num
                                in_rows,                        // height
                                in_cols,                        // width
                                out_backprop_depth,             // channels
                                out_backprop_rows,              // pooled_height
                                out_backprop_cols,              // pooled_width
                                window_rows,                    // kernel_h
                                window_cols,                    // kernel_w
                                row_stride,                     // stride_h
                                col_stride,                     // stride_w
                                pad_rows,                       // pad_t
                                pad_cols,                       // pad_l
                                output->flat<T>().data(),       // bottom_diff
                                context->eigen_gpu_device());   // d
    } else {
      DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kAverage,
                                   ksize_, stride_, padding_, data_format_,
                                   nullptr, nullptr, out_backprop, output_shape,
                                   /*propagate_nans=*/false);
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<float>("T")
                            .HostMemory("orig_input_shape"),
                        AvgPoolingGradOpCustomGPUKernel<float>);
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<double>("T")
                            .HostMemory("orig_input_shape"),
                        AvgPoolingGradOpCustomGPUKernel<double>);
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<Eigen::half>("T")
                            .HostMemory("orig_input_shape"),
                        AvgPoolingGradOpCustomGPUKernel<Eigen::half>);

#endif  // GOOGLE_CUDA

}  // namespace tensorflow