• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #define EIGEN_USE_THREADS
16 
17 #include "tensorflow/core/kernels/pooling_ops_3d.h"
18 
19 #include <array>
20 
21 #include "third_party/eigen3/Eigen/Core"
22 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
23 #include "tensorflow/core/framework/kernel_shape_util.h"
24 #include "tensorflow/core/framework/numeric_op.h"
25 #include "tensorflow/core/framework/op_kernel.h"
26 #include "tensorflow/core/framework/register_types.h"
27 #include "tensorflow/core/framework/tensor.h"
28 #include "tensorflow/core/framework/tensor_shape.h"
29 #include "tensorflow/core/framework/tensor_slice.h"
30 #include "tensorflow/core/kernels/eigen_pooling.h"
31 #include "tensorflow/core/kernels/ops_util.h"
32 #include "tensorflow/core/lib/core/errors.h"
33 #include "tensorflow/core/util/padding.h"
34 #include "tensorflow/core/util/tensor_format.h"
35 #include "tensorflow/core/util/work_sharder.h"
36 
37 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
38 #include "tensorflow/core/kernels/cudnn_pooling_gpu.h"
39 #include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
40 #endif
41 
42 
43 namespace tensorflow {
44 
45 typedef Eigen::ThreadPoolDevice CPUDevice;
46 typedef Eigen::GpuDevice GPUDevice;
47 
Pool3dParameters(OpKernelContext * context,const std::vector<int32> & ksize,const std::vector<int32> & stride,Padding padding,TensorFormat data_format,const TensorShape & tensor_in_shape)48 Pool3dParameters::Pool3dParameters(OpKernelContext* context,
49                                    const std::vector<int32>& ksize,
50                                    const std::vector<int32>& stride,
51                                    Padding padding, TensorFormat data_format,
52                                    const TensorShape& tensor_in_shape) {
53   // For maxpooling, tensor_in should have 4 dimensions.
54   OP_REQUIRES(context, tensor_in_shape.dims() == 5,
55               errors::InvalidArgument("tensor_in must be 4-dimensional"));
56 
57   this->data_format = data_format;
58   depth = GetTensorDim(tensor_in_shape, data_format, 'C');
59   tensor_in_planes = GetTensorDim(tensor_in_shape, data_format, '0');
60   tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, '1');
61   tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, '2');
62   tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
63   window_planes = GetTensorDim(ksize, data_format, '0');
64   window_rows = GetTensorDim(ksize, data_format, '1');
65   window_cols = GetTensorDim(ksize, data_format, '2');
66   depth_window = GetTensorDim(ksize, data_format, 'C');
67   plane_stride = GetTensorDim(stride, data_format, '0');
68   row_stride = GetTensorDim(stride, data_format, '1');
69   col_stride = GetTensorDim(stride, data_format, '2');
70   depth_stride = GetTensorDim(stride, data_format, 'C');
71 
72   // We only support 3D pooling across plane/width/height. Depthwise
73   // pooling is not supported.
74   OP_REQUIRES(
75       context, depth_window == 1 && depth_stride == 1,
76       errors::Unimplemented(
77           "Pooling3d only supports pooling across plane/width/height."));
78 
79   OP_REQUIRES_OK(context, GetWindowedOutputSize(tensor_in_planes, window_planes,
80                                                 plane_stride, padding,
81                                                 &out_plane, &pad_planes));
82   OP_REQUIRES_OK(context,
83                  GetWindowedOutputSize(tensor_in_rows, window_rows, row_stride,
84                                        padding, &out_height, &pad_rows));
85   OP_REQUIRES_OK(context,
86                  GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride,
87                                        padding, &out_width, &pad_cols));
88 }
89 
forward_output_shape()90 TensorShape Pool3dParameters::forward_output_shape() {
91   return ShapeFromFormat(data_format, tensor_in_batch,
92                          {{out_plane, out_height, out_width}}, depth);
93 }
94 
95 template <typename T>
96 struct LaunchPoolingOp<CPUDevice, T, AVG> {
launchtensorflow::LaunchPoolingOp97   static void launch(OpKernelContext* context, const Tensor& tensor_in,
98                      const std::array<int64, 3>& window,
99                      const std::array<int64, 3>& stride,
100                      const std::array<int64, 3>& padding,
101                      TensorFormat data_format, Padding padding_type,
102                      Tensor* output) {
103     output->tensor<T, 5>().device(context->eigen_device<CPUDevice>()) =
104         Eigen::CuboidAvgPooling(tensor_in.tensor<T, 5>(), window[0], window[1],
105                                 window[2], stride[0], stride[1], stride[2],
106                                 BrainPadding2EigenPadding(padding_type));
107   }
108 };
109 
110 template <typename T>
111 struct LaunchPoolingOp<CPUDevice, T, MAX> {
launchtensorflow::LaunchPoolingOp112   static void launch(OpKernelContext* context, const Tensor& tensor_in,
113                      const std::array<int64, 3>& window,
114                      const std::array<int64, 3>& stride,
115                      const std::array<int64, 3>& padding,
116                      TensorFormat data_format, Padding padding_type,
117                      Tensor* output) {
118     output->tensor<T, 5>().device(context->eigen_device<CPUDevice>()) =
119         Eigen::CuboidMaxPooling(tensor_in.tensor<T, 5>(), window[0], window[1],
120                                 window[2], stride[0], stride[1], stride[2],
121                                 BrainPadding2EigenPadding(padding_type));
122   }
123 };
124 
125 template <typename Device, typename T, PoolingType Type>
126 class Pooling3DOp : public UnaryOp<T> {
127  public:
Pooling3DOp(OpKernelConstruction * context)128   explicit Pooling3DOp(OpKernelConstruction* context) : UnaryOp<T>(context) {
129     string data_format;
130     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
131     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
132                 errors::InvalidArgument("Invalid data format"));
133     if (context->device_type() == DEVICE_CPU) {
134       OP_REQUIRES(
135           context, data_format_ == FORMAT_NHWC,
136           errors::InvalidArgument("Default Pooling3DOp only supports NDHWC ",
137                                   "on device type ",
138                                   DeviceTypeString(context->device_type())));
139     }
140     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
141     OP_REQUIRES(context, ksize_.size() == 5,
142                 errors::InvalidArgument("Sliding window ksize field must "
143                                         "specify 5 dimensions"));
144     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
145     OP_REQUIRES(context, stride_.size() == 5,
146                 errors::InvalidArgument("Sliding window stride field must "
147                                         "specify 5 dimensions"));
148     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
149     OP_REQUIRES(context,
150                 (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
151                  GetTensorDim(stride_, data_format_, 'N') == 1),
152                 errors::Unimplemented(
153                     "Pooling is not yet supported on the batch dimension."));
154     OP_REQUIRES(context,
155                 (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
156                  GetTensorDim(stride_, data_format_, 'C') == 1),
157                 errors::Unimplemented(
158                     "Pooling is not yet supported on the depth dimension."));
159   }
160 
Compute(OpKernelContext * context)161   void Compute(OpKernelContext* context) override {
162     const Tensor& tensor_in = context->input(0);
163 
164     OP_REQUIRES(context, tensor_in.dims() == 5,
165                 errors::InvalidArgument("tensor_in must be 5-dimensional"));
166     const int64_t depth = GetTensorDim(tensor_in, data_format_, 'C');
167     const int64_t in_batch = GetTensorDim(tensor_in, data_format_, 'N');
168 
169     // Dimension order for these arrays is: x, y, z.
170     std::array<int64, 3> input_size{
171         {GetTensorDim(tensor_in, data_format_, '2'),
172          GetTensorDim(tensor_in, data_format_, '1'),
173          GetTensorDim(tensor_in, data_format_, '0')}};
174     std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
175                                  GetTensorDim(ksize_, data_format_, '1'),
176                                  GetTensorDim(ksize_, data_format_, '0')}};
177     std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
178                                  GetTensorDim(stride_, data_format_, '1'),
179                                  GetTensorDim(stride_, data_format_, '0')}};
180     std::array<int64, 3> padding, out;
181 
182     OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
183                                             padding_, &out, &padding));
184 
185     TensorShape out_shape = ShapeFromFormat(data_format_, in_batch,
186                                             {{out[2], out[1], out[0]}}, depth);
187     Tensor* output;
188     OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
189     if (out_shape.num_elements() == 0) return;
190     LaunchPoolingOp<Device, T, Type>::launch(context, tensor_in, window, stride,
191                                              padding, data_format_, padding_,
192                                              output);
193   }
194 
195  private:
196   std::vector<int32> ksize_;
197   std::vector<int32> stride_;
198   Padding padding_;
199   TensorFormat data_format_;
200 };
201 
202 template <typename T>
203 struct LaunchMaxPooling3dGradOp<CPUDevice, T> {
launchtensorflow::LaunchMaxPooling3dGradOp204   static void launch(OpKernelContext* context, const Tensor& tensor_in,
205                      const Tensor& tensor_out, const Tensor& out_backprop,
206                      const std::array<int64, 3>& window,
207                      const std::array<int64, 3>& stride,
208                      const std::array<int64, 3>& out,
209                      const std::array<int64, 3>& padding,
210                      TensorFormat data_format, Tensor* output) {
211     output->flat<T>().setZero();
212     for (int64_t p = 0; p < out_backprop.dim_size(3); ++p) {
213       // Calculate broadcast size for planes/rows/cols. For SAME padding,
214       // current index could be in the padding area, and
215       //   p * stride_planes + window_planes
216       // could be beyond the input tensor's boundary. In such cases, change
217       // the starting index and reduce the broadcast size.
218       //
219       // The same procedure is repeated for every spatial dimension in the
220       // nested loops below.
221       int pindex, psize;
222       std::array<int64, 3> input_size{{tensor_in.dim_size(3),
223                                        tensor_in.dim_size(2),
224                                        tensor_in.dim_size(1)}};
225       OP_REQUIRES_OK(context,
226                      GetBroadcastSize(p, input_size[0], window[0], stride[0],
227                                       padding[0], &pindex, &psize));
228       for (int64_t r = 0; r < out_backprop.dim_size(2); ++r) {
229         int rindex, rsize;
230         OP_REQUIRES_OK(context,
231                        GetBroadcastSize(r, input_size[1], window[1], stride[1],
232                                         padding[1], &rindex, &rsize));
233         for (int64_t c = 0; c < out_backprop.dim_size(1); ++c) {
234           int cindex, csize;
235           OP_REQUIRES_OK(
236               context, GetBroadcastSize(c, input_size[2], window[2], stride[2],
237                                         padding[2], &cindex, &csize));
238           TensorSlice src{{0, -1}, {c, 1}, {r, 1}, {p, 1}, {0, -1}};
239           TensorSlice dst{{0, -1},
240                           {cindex, csize},
241                           {rindex, rsize},
242                           {pindex, psize},
243                           {0, -1}};
244           Eigen::DSizes<Eigen::DenseIndex, 5> src_indices;
245           Eigen::DSizes<Eigen::DenseIndex, 5> src_sizes;
246           Eigen::DSizes<Eigen::DenseIndex, 5> dst_indices;
247           Eigen::DSizes<Eigen::DenseIndex, 5> dst_sizes;
248           src.FillIndicesAndSizes<5>(out_backprop.shape(), &src_indices,
249                                      &src_sizes);
250           dst.FillIndicesAndSizes<5>(tensor_in.shape(), &dst_indices,
251                                      &dst_sizes);
252 
253 #if !defined(EIGEN_HAS_INDEX_LIST)
254           Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
255 #else
256           Eigen::IndexList<Eigen::type2index<1>, int, int, int,
257                            Eigen::type2index<1>>
258               bcast;
259           bcast.set(1, csize);
260           bcast.set(2, rsize);
261           bcast.set(3, psize);
262 #endif
263 
264           // Slice from tensor_in.
265           Eigen::Tensor<T, 5, Eigen::RowMajor> tensor_in_slice(dst_sizes);
266           tensor_in_slice.device(context->eigen_cpu_device()) =
267               tensor_in.tensor<T, 5>().slice(dst_indices, dst_sizes);
268 
269           // Slice from tensor_out.
270           Eigen::Tensor<T, 5, Eigen::RowMajor> tensor_out_slice(src_sizes);
271           tensor_out_slice.device(context->eigen_cpu_device()) =
272               tensor_out.tensor<T, 5>().slice(src_indices, src_sizes);
273 
274           // Backprop slice.
275           Eigen::Tensor<T, 5, Eigen::RowMajor> out_backprop_slice(src_sizes);
276           out_backprop_slice.device(context->eigen_cpu_device()) =
277               out_backprop.tensor<T, 5>().slice(src_indices, src_sizes);
278 
279           // The true backprop slice: if an element is the max, choose
280           // the backprop slice; otherwise set to 0.
281           Eigen::Tensor<T, 5, Eigen::RowMajor> select_slice(dst_sizes);
282           Eigen::Tensor<T, 5, Eigen::RowMajor> mat0(dst_sizes);
283           mat0.setZero();
284           select_slice =
285               ((tensor_in_slice - tensor_out_slice.broadcast(bcast)).abs() <
286                tensor_in_slice.constant(1e-5))
287                   .select(out_backprop_slice.broadcast(bcast), mat0);
288 
289           output->tensor<T, 5>()
290               .slice(dst_indices, dst_sizes)
291               .device(context->eigen_cpu_device()) += select_slice;
292         }
293       }
294     }
295   }
296 };
297 
298 template <class Device, class T>
299 class MaxPooling3dGradOp : public OpKernel {
300  public:
MaxPooling3dGradOp(OpKernelConstruction * context)301   explicit MaxPooling3dGradOp(OpKernelConstruction* context)
302       : OpKernel(context) {
303     string data_format;
304     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
305     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
306                 errors::InvalidArgument("Invalid data format"));
307     if (context->device_type() == DEVICE_CPU) {
308       OP_REQUIRES(
309           context, data_format_ == FORMAT_NHWC,
310           errors::InvalidArgument(
311               "Default MaxPooling3dGradOp only supports NDHWC ",
312               "on device type ", DeviceTypeString(context->device_type())));
313     }
314     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
315     OP_REQUIRES(context, ksize_.size() == 5,
316                 errors::InvalidArgument("Sliding window ksize field must "
317                                         "specify 5 dimensions"));
318     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
319     OP_REQUIRES(context, stride_.size() == 5,
320                 errors::InvalidArgument("Sliding window stride field must "
321                                         "specify 5 dimensions"));
322     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
323     OP_REQUIRES(context,
324                 (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
325                  GetTensorDim(stride_, data_format_, 'N') == 1),
326                 errors::Unimplemented(
327                     "Pooling is not yet supported on the batch dimension."));
328     OP_REQUIRES(context,
329                 (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
330                  GetTensorDim(stride_, data_format_, 'C') == 1),
331                 errors::Unimplemented(
332                     "Pooling is not yet supported on the depth dimension."));
333   }
334 
Compute(OpKernelContext * context)335   void Compute(OpKernelContext* context) override {
336     const Tensor& tensor_in = context->input(0);
337     const Tensor& tensor_out = context->input(1);
338     const Tensor& out_backprop = context->input(2);
339     OP_REQUIRES(context, tensor_in.dims() == 5,
340                 errors::InvalidArgument("tensor_in must be 5-dimensional"));
341     OP_REQUIRES(context, tensor_out.dims() == 5,
342                 errors::InvalidArgument("tensor_out must be 5-dimensional"));
343     OP_REQUIRES(context, out_backprop.dims() == 5,
344                 errors::InvalidArgument("out_backprop must be 5-dimensional"));
345 
346     const TensorShape& output_shape = tensor_in.shape();
347     Tensor* input_backprop;
348     OP_REQUIRES_OK(context,
349                    context->allocate_output(0, output_shape, &input_backprop));
350     std::array<int64, 3> input_size{
351         {GetTensorDim(output_shape, data_format_, '2'),
352          GetTensorDim(output_shape, data_format_, '1'),
353          GetTensorDim(output_shape, data_format_, '0')}};
354     std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
355                                  GetTensorDim(ksize_, data_format_, '1'),
356                                  GetTensorDim(ksize_, data_format_, '0')}};
357     std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
358                                  GetTensorDim(stride_, data_format_, '1'),
359                                  GetTensorDim(stride_, data_format_, '0')}};
360     std::array<int64, 3> out, padding;
361 
362     OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
363                                             padding_, &out, &padding));
364     LaunchMaxPooling3dGradOp<Device, T>::launch(
365         context, tensor_in, tensor_out, out_backprop, window, stride, out,
366         padding, data_format_, input_backprop);
367   }
368 
369  private:
370   std::vector<int32> ksize_;
371   std::vector<int32> stride_;
372   Padding padding_;
373   TensorFormat data_format_;
374 };
375 
376 template <typename T>
377 struct LaunchAvgPooling3dGradOp<CPUDevice, T> {
launchtensorflow::LaunchAvgPooling3dGradOp378   static void launch(OpKernelContext* context,
379                      const TensorShape& tensor_in_shape,
380                      const Tensor& out_backprop,
381                      const std::array<int64, 3>& window,
382                      const std::array<int64, 3>& stride,
383                      const std::array<int64, 3>& output_shape,
384                      const std::array<int64, 3>& padding,
385                      TensorFormat data_format, Tensor* output) {
386     OP_REQUIRES(
387         context, tensor_in_shape.dim_size(0) == out_backprop.dim_size(0),
388         errors::InvalidArgument(
389             "Expected first dimension of tensor_in_shape and "
390             "out_backprop to match, got ",
391             tensor_in_shape.dim_size(0), " and ", out_backprop.dim_size(0)));
392     OP_REQUIRES(
393         context, tensor_in_shape.dim_size(4) == out_backprop.dim_size(4),
394         errors::InvalidArgument(
395             "Expected last dimension of tensor_in_shape and "
396             "out_backprop to match, got ",
397             tensor_in_shape.dim_size(4), " and ", out_backprop.dim_size(4)));
398 
399     output->flat<T>().setZero();
400     std::array<int64, 3> input_size = {{tensor_in_shape.dim_size(3),
401                                         tensor_in_shape.dim_size(2),
402                                         tensor_in_shape.dim_size(1)}};
403     for (int64_t p = 0; p < out_backprop.dim_size(3); ++p) {
404       // Calculate broadcast size for planes/rows/cols. For SAME padding,
405       // current index could be in the padding area, and
406       //   p * stride_planes + window_planes
407       // could be beyond the input tensor's boundary. In such cases, change
408       // the starting index and reduce the broadcast size.
409       //
410       // The same procedure is repeated for every spatial dimension in the
411       // nested loops below.
412       int pindex, psize;
413       OP_REQUIRES_OK(context,
414                      GetBroadcastSize(p, input_size[0], window[0], stride[0],
415                                       padding[0], &pindex, &psize));
416       for (int64_t r = 0; r < out_backprop.dim_size(2); ++r) {
417         int rindex, rsize;
418         OP_REQUIRES_OK(context,
419                        GetBroadcastSize(r, input_size[1], window[1], stride[1],
420                                         padding[1], &rindex, &rsize));
421         for (int64_t c = 0; c < out_backprop.dim_size(1); ++c) {
422           int cindex, csize;
423           OP_REQUIRES_OK(
424               context, GetBroadcastSize(c, input_size[2], window[2], stride[2],
425                                         padding[2], &cindex, &csize));
426           TensorSlice src{{0, -1}, {c, 1}, {r, 1}, {p, 1}, {0, -1}};
427           TensorSlice dst{{0, -1},
428                           {cindex, csize},
429                           {rindex, rsize},
430                           {pindex, psize},
431                           {0, -1}};
432           Eigen::DSizes<Eigen::DenseIndex, 5> src_indices;
433           Eigen::DSizes<Eigen::DenseIndex, 5> src_sizes;
434           Eigen::DSizes<Eigen::DenseIndex, 5> dst_indices;
435           Eigen::DSizes<Eigen::DenseIndex, 5> dst_sizes;
436           src.FillIndicesAndSizes<5>(out_backprop.shape(), &src_indices,
437                                      &src_sizes);
438           dst.FillIndicesAndSizes<5>(tensor_in_shape, &dst_indices, &dst_sizes);
439 #if !defined(EIGEN_HAS_INDEX_LIST)
440           Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
441 #else
442           Eigen::IndexList<Eigen::type2index<1>, int, int, int,
443                            Eigen::type2index<1>>
444               bcast;
445           bcast.set(1, csize);
446           bcast.set(2, rsize);
447           bcast.set(3, psize);
448 #endif
449           Eigen::Tensor<T, 5, Eigen::RowMajor> slices(src_sizes);
450           slices.device(context->eigen_cpu_device()) =
451               out_backprop.tensor<T, 5>().slice(src_indices, src_sizes);
452           // Divide by the size of the actual patch (psize * rsize * csize).
453           float divide_size = rsize * csize * psize * 1.0f;
454           slices *= slices.constant(1.0f / divide_size);
455 
456           output->tensor<T, 5>()
457               .slice(dst_indices, dst_sizes)
458               .device(context->eigen_cpu_device()) += slices.broadcast(bcast);
459         }
460       }
461     }
462   }
463 };
464 
465 template <class Device, class T>
466 class AvgPooling3dGradOp : public OpKernel {
467  public:
AvgPooling3dGradOp(OpKernelConstruction * context)468   explicit AvgPooling3dGradOp(OpKernelConstruction* context)
469       : OpKernel(context) {
470     string data_format;
471     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
472     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
473                 errors::InvalidArgument("Invalid data format"));
474     if (context->device_type() == DEVICE_CPU) {
475       OP_REQUIRES(
476           context, data_format_ == FORMAT_NHWC,
477           errors::InvalidArgument(
478               "Default AvgPooling3dGradOp only supports NDHWC ",
479               "on device type ", DeviceTypeString(context->device_type())));
480     }
481     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
482     OP_REQUIRES(context, ksize_.size() == 5,
483                 errors::InvalidArgument("Sliding window ksize field must "
484                                         "specify 5 dimensions"));
485     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
486     OP_REQUIRES(context, stride_.size() == 5,
487                 errors::InvalidArgument("Sliding window stride field must "
488                                         "specify 5 dimensions"));
489     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
490     OP_REQUIRES(context,
491                 (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
492                  GetTensorDim(stride_, data_format_, 'N') == 1),
493                 errors::Unimplemented(
494                     "Pooling is not yet supported on the batch dimension."));
495     OP_REQUIRES(context,
496                 (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
497                  GetTensorDim(stride_, data_format_, 'C') == 1),
498                 errors::Unimplemented(
499                     "Pooling is not yet supported on the depth dimension."));
500   }
501 
Compute(OpKernelContext * context)502   void Compute(OpKernelContext* context) override {
503     const Tensor& tensor_in_shape = context->input(0);
504     const Tensor& out_backprop = context->input(1);
505     OP_REQUIRES(
506         context,
507         tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 5,
508         errors::InvalidArgument("tensor_in must be 1-dimensional and 5 "
509                                 "elements"));
510     OP_REQUIRES(context, out_backprop.dims() == 5,
511                 errors::InvalidArgument("out_backprop must be 5-dimensional"));
512 
513     TensorShape output_shape;
514     auto shape_vec = tensor_in_shape.vec<int32>();
515     for (int64_t i = 0; i < tensor_in_shape.NumElements(); ++i) {
516       output_shape.AddDim(shape_vec(i));
517     }
518 
519     Tensor* output;
520     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
521 
522     // Dimension order for these arrays is x, y, z.
523     std::array<int64, 3> input_size{
524         {GetTensorDim(output_shape, data_format_, '2'),
525          GetTensorDim(output_shape, data_format_, '1'),
526          GetTensorDim(output_shape, data_format_, '0')}};
527     std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
528                                  GetTensorDim(ksize_, data_format_, '1'),
529                                  GetTensorDim(ksize_, data_format_, '0')}};
530     std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
531                                  GetTensorDim(stride_, data_format_, '1'),
532                                  GetTensorDim(stride_, data_format_, '0')}};
533     std::array<int64, 3> padding, out;
534 
535     OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
536                                             padding_, &out, &padding));
537 
538     LaunchAvgPooling3dGradOp<Device, T>::launch(
539         context, output_shape, out_backprop, window, stride, out, padding,
540         data_format_, output);
541   }
542 
543  private:
544   std::vector<int32> ksize_;
545   std::vector<int32> stride_;
546   Padding padding_;
547   TensorFormat data_format_;
548 };
549 
550 template <typename T>
551 struct LaunchMaxPooling3dGradGradOp<CPUDevice, T> {
launchtensorflow::LaunchMaxPooling3dGradGradOp552   static void launch(OpKernelContext* context, const Pool3dParameters& params,
553                      const Tensor& tensor_in, const Tensor& tensor_out,
554                      const Tensor& tensor_top_diff,
555                      Tensor* tensor_bottom_diff) {
556     OP_REQUIRES(
557         context, params.data_format == FORMAT_NHWC,
558         errors::InvalidArgument("Default MaxPooling3dGradGradOp only supports",
559                                 "NDHWC on CPU device type"));
560 
561     typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
562         ConstEigenMatrixMap;
563     typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
564         EigenMatrixMap;
565 
566     ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
567                                params.tensor_in_planes * params.tensor_in_cols *
568                                    params.tensor_in_rows *
569                                    params.tensor_in_batch);
570     ConstEigenMatrixMap out_mat(tensor_out.flat<T>().data(), params.depth,
571                                 params.out_plane * params.out_width *
572                                     params.out_height * params.tensor_in_batch);
573     ConstEigenMatrixMap top_diff_mat(
574         tensor_top_diff.flat<T>().data(), params.depth,
575         params.tensor_in_planes * params.tensor_in_cols *
576             params.tensor_in_rows * params.tensor_in_batch);
577     EigenMatrixMap bottom_diff_mat(
578         tensor_bottom_diff->flat<T>().data(), params.depth,
579         params.out_plane * params.out_width * params.out_height *
580             params.tensor_in_batch);
581 
582     const DeviceBase::CpuWorkerThreads& worker_threads =
583         *(context->device()->tensorflow_cpu_worker_threads());
584 
585     auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
586                      int64_t start, int64_t limit) {
587       const int32_t depth = params.depth;
588       const int32_t in_planes = params.tensor_in_planes;
589       const int32_t in_rows = params.tensor_in_rows;
590       const int32_t in_cols = params.tensor_in_cols;
591       const int32_t pad_planes = params.pad_planes;
592       const int32_t pad_rows = params.pad_rows;
593       const int32_t pad_cols = params.pad_cols;
594       const int32_t window_planes = params.window_planes;
595       const int32_t window_rows = params.window_rows;
596       const int32_t window_cols = params.window_cols;
597       const int32_t plane_stride = params.plane_stride;
598       const int32_t row_stride = params.row_stride;
599       const int32_t col_stride = params.col_stride;
600       const int32_t out_plane = params.out_plane;
601       const int32_t out_height = params.out_height;
602       const int32_t out_width = params.out_width;
603 
604       {
605         // Initializes the output grad backprop tensor with 0.
606         const int32_t output_image_size =
607             out_plane * out_height * out_width * params.depth;
608         EigenMatrixMap bottom_diff_shard(
609             bottom_diff_mat.data() + start * output_image_size, 1,
610             (limit - start) * output_image_size);
611         bottom_diff_shard.setZero();
612       }
613 
614       for (int b = start; b < limit; ++b) {
615         for (int pp = 0; pp < out_plane; ++pp) {
616           for (int ph = 0; ph < out_height; ++ph) {
617             for (int pw = 0; pw < out_width; ++pw) {
618               // (p_start, p_end) * (h_start, h_end) * (w_start, w_end) is the
619               // range that the input vector projects to.
620               int p_start = pp * plane_stride - pad_planes;
621               const int p_end = std::min(p_start + window_planes, in_planes);
622               int h_start = ph * row_stride - pad_rows;
623               const int h_end = std::min(h_start + window_rows, in_rows);
624               int w_start = pw * col_stride - pad_cols;
625               const int w_end = std::min(w_start + window_cols, in_cols);
626               p_start = std::max(p_start, 0);
627               h_start = std::max(h_start, 0);
628               w_start = std::max(w_start, 0);
629               const int out_index =
630                   ((b * out_plane + pp) * out_height + ph) * out_width + pw;
631               // Find value corresponding to the input maximum in top_diff.
632               for (int d = 0; d < depth; ++d) {
633                 const T& output_ref = out_mat.coeffRef(d, out_index);
634                 bool should_stop = false;
635                 for (int p = p_start; p < p_end && !should_stop; ++p) {
636                   for (int h = h_start; h < h_end && !should_stop; ++h) {
637                     for (int w = w_start; w < w_end && !should_stop; ++w) {
638                       const int in_index =
639                           ((b * in_planes + p) * in_rows + h) * in_cols + w;
640                       const T& input_ref = in_mat.coeffRef(d, in_index);
641                       if (output_ref == input_ref) {
642                         T& bottom_diff_ref =
643                             bottom_diff_mat.coeffRef(d, out_index);
644                         bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
645                         should_stop = true;
646                       }
647                     }
648                   }
649                 }
650               }
651             }
652           }
653         }
654       }
655     };
656     const int64_t shard_cost =
657         params.out_plane * params.out_height * params.out_width * params.depth *
658         params.window_planes * params.window_rows * params.window_cols;
659     Shard(worker_threads.num_threads, worker_threads.workers,
660           params.tensor_in_batch, shard_cost, shard);
661   }
662 };
663 
664 template <class Device, class T>
665 class MaxPooling3dGradGradOp : public OpKernel {
666  public:
MaxPooling3dGradGradOp(OpKernelConstruction * context)667   explicit MaxPooling3dGradGradOp(OpKernelConstruction* context)
668       : OpKernel(context) {
669     string data_format;
670     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
671     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
672                 errors::InvalidArgument("Invalid data format"));
673     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
674     OP_REQUIRES(context, ksize_.size() == 5,
675                 errors::InvalidArgument("Sliding window ksize field must "
676                                         "specify 5 dimensions"));
677     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
678     OP_REQUIRES(context, stride_.size() == 5,
679                 errors::InvalidArgument("Sliding window strides field must "
680                                         "specify 5 dimensions"));
681     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
682     OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
683                 errors::Unimplemented(
684                     "Pooling is not yet supported on the batch dimension."));
685     const int32_t ksize_c = GetTensorDim(ksize_, data_format_, 'C');
686     const int32_t stride_c = GetTensorDim(stride_, data_format_, 'C');
687     OP_REQUIRES(context, ksize_c == 1 && stride_c == 1,
688                 errors::Unimplemented("MaxPooling3dGradGrad is not yet "
689                                       "supported on the depth dimension."));
690   }
691 
Compute(OpKernelContext * context)692   void Compute(OpKernelContext* context) override {
693     const Tensor& tensor_in = context->input(0);
694     const Tensor& tensor_out = context->input(1);
695     const Tensor& out_grad_backprop = context->input(2);
696 
697     // For maxpooling3d, tensor_in should have 5 dimensions.
698     OP_REQUIRES(context, tensor_in.dims() == 5,
699                 errors::InvalidArgument("tensor_in must be 5-dimensional"));
700     OP_REQUIRES(context, tensor_out.dims() == 5,
701                 errors::InvalidArgument("tensor_out must be 5-dimensional"));
702     // For maxpooling3d, out_grad_backprop should have 5 dimensions.
703     OP_REQUIRES(
704         context, out_grad_backprop.dims() == 5,
705         errors::InvalidArgument("out_grad_backprop must be 5-dimensional"));
706 
707     Pool3dParameters params{context,  ksize_,       stride_,
708                             padding_, data_format_, tensor_in.shape()};
709     if (!context->status().ok()) return;  // params is invalid
710 
711     Tensor* output = nullptr;
712     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
713                                 {2}, 0, tensor_out.shape(), &output));
714 
715     // Given access patterns in LaunchMaxPooling3dGradGradOp, these tensors must
716     // have elements.
717     OP_REQUIRES(context, tensor_in.NumElements() > 0,
718                 errors::InvalidArgument("received empty tensor tensor_in: ",
719                                         tensor_in.DebugString()));
720     OP_REQUIRES(context, tensor_out.NumElements() > 0,
721                 errors::InvalidArgument("received empty tensor tensor_out: ",
722                                         tensor_out.DebugString()));
723     OP_REQUIRES(
724         context, out_grad_backprop.NumElements() > 0,
725         errors::InvalidArgument("received empty tensor out_grad_backprop: ",
726                                 out_grad_backprop.DebugString()));
727     OP_REQUIRES(context,
728                 tensor_in.NumElements() == out_grad_backprop.NumElements(),
729                 errors::InvalidArgument("tensor_in and out_grad_backprop must "
730                                         "have same number of elements, got <",
731                                         tensor_in.DebugString(), "> and <",
732                                         out_grad_backprop.DebugString(), ">"));
733     OP_REQUIRES(
734         context, tensor_out.NumElements() == output->NumElements(),
735         errors::InvalidArgument(
736             "tensor_out and output must have same number of elements, got <",
737             tensor_out.DebugString(), "> and <", output->DebugString(), ">"));
738 
739     LaunchMaxPooling3dGradGradOp<Device, T>::launch(
740         context, params, tensor_in, tensor_out, out_grad_backprop, output);
741   }
742 
743  private:
744   std::vector<int32> ksize_;
745   std::vector<int32> stride_;
746   Padding padding_;
747   TensorFormat data_format_;
748 };
749 
750 #define REGISTER_KERNELS(D, T)                                             \
751   REGISTER_KERNEL_BUILDER(                                                 \
752       Name("MaxPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"),         \
753       Pooling3DOp<D##Device, T, MAX>);                                     \
754   REGISTER_KERNEL_BUILDER(Name("MaxPool3DGrad")                            \
755                               .Device(DEVICE_##D)                          \
756                               .TypeConstraint<T>("T")                      \
757                               .TypeConstraint<T>("TInput"),                \
758                           MaxPooling3dGradOp<D##Device, T>);               \
759   REGISTER_KERNEL_BUILDER(                                                 \
760       Name("MaxPool3DGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
761       MaxPooling3dGradGradOp<D##Device, T>);                               \
762   REGISTER_KERNEL_BUILDER(                                                 \
763       Name("AvgPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"),         \
764       Pooling3DOp<D##Device, T, AVG>);                                     \
765   REGISTER_KERNEL_BUILDER(Name("AvgPool3DGrad")                            \
766                               .Device(DEVICE_##D)                          \
767                               .TypeConstraint<T>("T")                      \
768                               .HostMemory("orig_input_shape"),             \
769                           AvgPooling3dGradOp<D##Device, T>);
770 
771 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T)
772 TF_CALL_float(REGISTER_CPU_KERNELS);
773 #undef REGISTER_CPU_KERNELS
774 
775 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
776 
777 template <typename T>
778 struct LaunchPoolingOp<GPUDevice, T, AVG> {
launchtensorflow::LaunchPoolingOp779   static void launch(OpKernelContext* context, const Tensor& tensor_in,
780                      const std::array<int64, 3>& window,
781                      const std::array<int64, 3>& stride,
782                      const std::array<int64, 3>& padding,
783                      TensorFormat data_format, Padding padding_type,
784                      Tensor* output) {
785     DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, window,
786                                stride, padding, data_format, tensor_in, output);
787   }
788 };
789 
790 template <typename T>
791 struct LaunchPoolingOp<GPUDevice, T, MAX> {
launchtensorflow::LaunchPoolingOp792   static void launch(OpKernelContext* context, const Tensor& tensor_in,
793                      const std::array<int64, 3>& window,
794                      const std::array<int64, 3>& stride,
795                      const std::array<int64, 3>& padding,
796                      TensorFormat data_format, Padding padding_type,
797                      Tensor* output) {
798     DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, window,
799                                stride, padding, data_format, tensor_in, output);
800   }
801 };
802 
803 template <typename T>
804 struct LaunchMaxPooling3dGradOp<GPUDevice, T> {
launchtensorflow::LaunchMaxPooling3dGradOp805   static void launch(OpKernelContext* context, const Tensor& tensor_in,
806                      const Tensor& tensor_out, const Tensor& out_backprop,
807                      const std::array<int64, 3>& window,
808                      const std::array<int64, 3>& stride,
809                      const std::array<int64, 3>& out,
810                      const std::array<int64, 3>& padding,
811                      TensorFormat data_format, Tensor* input_backprop) {
812     const TensorShape output_shape = tensor_in.shape();
813     DnnPooling3dGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
814                                    window, stride, padding, out, data_format,
815                                    out_backprop, output_shape, &tensor_in,
816                                    &tensor_out, input_backprop);
817   }
818 };
819 
820 template <typename T>
821 struct LaunchAvgPooling3dGradOp<GPUDevice, T> {
launchtensorflow::LaunchAvgPooling3dGradOp822   static void launch(OpKernelContext* context,
823                      const TensorShape& tensor_in_shape,
824                      const Tensor& out_backprop,
825                      const std::array<int64, 3>& window,
826                      const std::array<int64, 3>& stride,
827                      const std::array<int64, 3>& out,
828                      const std::array<int64, 3>& padding,
829                      TensorFormat data_format, Tensor* output) {
830     DnnPooling3dGradOp<T>::Compute(
831         context, se::dnn::PoolingMode::kAverage, window, stride, padding, out,
832         data_format, out_backprop, tensor_in_shape, nullptr, nullptr, output);
833   }
834 };
835 
836 template <typename T>
837 struct LaunchMaxPooling3dGradGradOp<GPUDevice, T> {
launchtensorflow::LaunchMaxPooling3dGradGradOp838   static void launch(OpKernelContext* context, const Pool3dParameters& params,
839                      const Tensor& tensor_in, const Tensor& tensor_out,
840                      const Tensor& tensor_top_diff,
841                      Tensor* tensor_bottom_diff) {
842     bool status = functor::MaxPool3dGradBackward<T>()(
843         params.data_format, tensor_in.flat<T>().data(),
844         tensor_out.flat<T>().data(), params.tensor_in_batch, params.out_plane,
845         params.out_height, params.out_width, params.depth,
846         params.tensor_in_planes, params.tensor_in_rows, params.tensor_in_cols,
847         params.window_planes, params.window_rows, params.window_cols,
848         params.plane_stride, params.row_stride, params.col_stride,
849         params.pad_planes, params.pad_rows, params.pad_cols,
850         tensor_top_diff.flat<T>().data(), tensor_bottom_diff->flat<T>().data(),
851         context->eigen_gpu_device());
852     if (!status) {
853       context->SetStatus(
854           errors::Internal("Failed launching MaxPool3dGradBackward"));
855     }
856   }
857 };
858 
859 #define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T)
860 TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
861 #undef REGISTER_GPU_KERNELS
862 
863 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
864 
865 
866 #undef REGISTER_KERNELS
867 
868 }  // namespace tensorflow
869