/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/pooling_ops_3d.h"

#include <array>

#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/kernel_shape_util.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/eigen_pooling.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cudnn_pooling_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#endif

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

Pool3dParameters::Pool3dParameters(OpKernelContext* context,
                                   const std::vector<int32>& ksize,
                                   const std::vector<int32>& stride,
                                   Padding padding, TensorFormat data_format,
                                   const TensorShape& tensor_in_shape) {
  // For 3D pooling, tensor_in should have 5 dimensions.
  OP_REQUIRES(context, tensor_in_shape.dims() == 5,
              errors::InvalidArgument("tensor_in must be 5-dimensional"));

  this->data_format = data_format;
  depth = GetTensorDim(tensor_in_shape, data_format, 'C');
  tensor_in_planes = GetTensorDim(tensor_in_shape, data_format, '0');
  tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, '1');
  tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, '2');
  tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
  window_planes = GetTensorDim(ksize, data_format, '0');
  window_rows = GetTensorDim(ksize, data_format, '1');
  window_cols = GetTensorDim(ksize, data_format, '2');
  depth_window = GetTensorDim(ksize, data_format, 'C');
  plane_stride = GetTensorDim(stride, data_format, '0');
  row_stride = GetTensorDim(stride, data_format, '1');
  col_stride = GetTensorDim(stride, data_format, '2');
  depth_stride = GetTensorDim(stride, data_format, 'C');

  // We only support 3D pooling across plane/width/height. Depthwise
  // pooling is not supported.
  OP_REQUIRES(
      context, depth_window == 1 && depth_stride == 1,
      errors::Unimplemented(
          "Pooling3d only supports pooling across plane/width/height."));

  OP_REQUIRES_OK(context, GetWindowedOutputSize(tensor_in_planes, window_planes,
                                                plane_stride, padding,
                                                &out_plane, &pad_planes));
  OP_REQUIRES_OK(context,
                 GetWindowedOutputSize(tensor_in_rows, window_rows, row_stride,
                                       padding, &out_height, &pad_rows));
  OP_REQUIRES_OK(context,
                 GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride,
                                       padding, &out_width, &pad_cols));
}

TensorShape Pool3dParameters::forward_output_shape() {
  return ShapeFromFormat(data_format, tensor_in_batch,
                         {{out_plane, out_height, out_width}}, depth);
}

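// The CPU specializations of LaunchPoolingOp delegate to Eigen's
// CuboidAvgPooling/CuboidMaxPooling expressions. Note that the explicit
// per-dimension `padding` array is unused on this path: Eigen derives the
// padding amounts itself from `padding_type`.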
template <typename T>
struct LaunchPoolingOp<CPUDevice, T, AVG> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    output->tensor<T, 5>().device(context->eigen_device<CPUDevice>()) =
        Eigen::CuboidAvgPooling(tensor_in.tensor<T, 5>(), window[0], window[1],
                                window[2], stride[0], stride[1], stride[2],
                                BrainPadding2EigenPadding(padding_type));
  }
};

template <typename T>
struct LaunchPoolingOp<CPUDevice, T, MAX> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    output->tensor<T, 5>().device(context->eigen_device<CPUDevice>()) =
        Eigen::CuboidMaxPooling(tensor_in.tensor<T, 5>(), window[0], window[1],
                                window[2], stride[0], stride[1], stride[2],
                                BrainPadding2EigenPadding(padding_type));
  }
};

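// Pooling3DOp implements the forward AvgPool3D/MaxPool3D kernels: the ksize,
// strides, padding, and data_format attrs are validated once at construction,
// and Compute() dispatches to the LaunchPoolingOp specialization for the
// device and pooling type.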
template <typename Device, typename T, PoolingType Type>
class Pooling3DOp : public UnaryOp<T> {
 public:
  explicit Pooling3DOp(OpKernelConstruction* context) : UnaryOp<T>(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->device_type() == DEVICE_CPU) {
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument("Default Pooling3DOp only supports NDHWC ",
                                  "on device type ",
                                  DeviceTypeString(context->device_type())));
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
                 GetTensorDim(stride_, data_format_, 'N') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
                 GetTensorDim(stride_, data_format_, 'C') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    OP_REQUIRES(context, tensor_in.dims() == 5,
                errors::InvalidArgument("tensor_in must be 5-dimensional"));
    const int64 depth = GetTensorDim(tensor_in, data_format_, 'C');
    const int64 in_batch = GetTensorDim(tensor_in, data_format_, 'N');

    // Dimension order for these arrays is: x, y, z.
    std::array<int64, 3> input_size{
        {GetTensorDim(tensor_in, data_format_, '2'),
         GetTensorDim(tensor_in, data_format_, '1'),
         GetTensorDim(tensor_in, data_format_, '0')}};
    std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
                                 GetTensorDim(ksize_, data_format_, '1'),
                                 GetTensorDim(ksize_, data_format_, '0')}};
    std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
                                 GetTensorDim(stride_, data_format_, '1'),
                                 GetTensorDim(stride_, data_format_, '0')}};
    std::array<int64, 3> padding, out;

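    // Get3dOutputSize applies TensorFlow's standard padding arithmetic to
    // each spatial dimension: VALID gives out = ceil((in - window + 1) /
    // stride), while SAME gives out = ceil(in / stride).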
    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
                                            padding_, &out, &padding));

    TensorShape out_shape = ShapeFromFormat(data_format_, in_batch,
                                            {{out[2], out[1], out[0]}}, depth);
    Tensor* output;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
    if (out_shape.num_elements() == 0) return;
    LaunchPoolingOp<Device, T, Type>::launch(context, tensor_in, window, stride,
                                             padding, data_format_, padding_,
                                             output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename T>
struct LaunchMaxPooling3dGradOp<CPUDevice, T> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const Tensor& tensor_out, const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& out,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* output) {
    output->flat<T>().setZero();
    for (int64 p = 0; p < out_backprop.dim_size(3); ++p) {
      // Calculate broadcast size for planes/rows/cols. For SAME padding,
      // current index could be in the padding area, and
      //   p * stride_planes + window_planes
      // could be beyond the input tensor's boundary. In such cases, change
      // the starting index and reduce the broadcast size.
      //
      // The same procedure is repeated for every spatial dimension in the
      // nested loops below.
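      // Illustrative example: with input extent 5, window 3, stride 2, and
      // one plane of SAME padding, output index p = 2 nominally covers input
      // range [3, 6), which is clipped to [3, 5), so pindex = 3 and psize = 2.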
      int pindex, psize;
      std::array<int64, 3> input_size{{tensor_in.dim_size(3),
                                       tensor_in.dim_size(2),
                                       tensor_in.dim_size(1)}};
      OP_REQUIRES_OK(context,
                     GetBroadcastSize(p, input_size[0], window[0], stride[0],
                                      padding[0], &pindex, &psize));
      for (int64 r = 0; r < out_backprop.dim_size(2); ++r) {
        int rindex, rsize;
        OP_REQUIRES_OK(context,
                       GetBroadcastSize(r, input_size[1], window[1], stride[1],
                                        padding[1], &rindex, &rsize));
        for (int64 c = 0; c < out_backprop.dim_size(1); ++c) {
          int cindex, csize;
          OP_REQUIRES_OK(
              context, GetBroadcastSize(c, input_size[2], window[2], stride[2],
                                        padding[2], &cindex, &csize));
          TensorSlice src{{0, -1}, {c, 1}, {r, 1}, {p, 1}, {0, -1}};
          TensorSlice dst{{0, -1},
                          {cindex, csize},
                          {rindex, rsize},
                          {pindex, psize},
                          {0, -1}};
          Eigen::DSizes<Eigen::DenseIndex, 5> src_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> src_sizes;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_sizes;
          src.FillIndicesAndSizes<5>(out_backprop.shape(), &src_indices,
                                     &src_sizes);
          dst.FillIndicesAndSizes<5>(tensor_in.shape(), &dst_indices,
                                     &dst_sizes);

#if !defined(EIGEN_HAS_INDEX_LIST)
          Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
#else
          Eigen::IndexList<Eigen::type2index<1>, int, int, int,
                           Eigen::type2index<1>>
              bcast;
          bcast.set(1, csize);
          bcast.set(2, rsize);
          bcast.set(3, psize);
#endif

          // Slice from tensor_in.
          Eigen::Tensor<T, 5, Eigen::RowMajor> tensor_in_slice(dst_sizes);
          tensor_in_slice.device(context->eigen_cpu_device()) =
              tensor_in.tensor<T, 5>().slice(dst_indices, dst_sizes);

          // Slice from tensor_out.
          Eigen::Tensor<T, 5, Eigen::RowMajor> tensor_out_slice(src_sizes);
          tensor_out_slice.device(context->eigen_cpu_device()) =
              tensor_out.tensor<T, 5>().slice(src_indices, src_sizes);

          // Backprop slice.
          Eigen::Tensor<T, 5, Eigen::RowMajor> out_backprop_slice(src_sizes);
          out_backprop_slice.device(context->eigen_cpu_device()) =
              out_backprop.tensor<T, 5>().slice(src_indices, src_sizes);

          // The true backprop slice: if an element is the max, choose
          // the backprop slice; otherwise set to 0.
          Eigen::Tensor<T, 5, Eigen::RowMajor> select_slice(dst_sizes);
          Eigen::Tensor<T, 5, Eigen::RowMajor> mat0(dst_sizes);
          mat0.setZero();
          select_slice =
              ((tensor_in_slice - tensor_out_slice.broadcast(bcast)).abs() <
               tensor_in_slice.constant(1e-5))
                  .select(out_backprop_slice.broadcast(bcast), mat0);
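          // Note: max positions are detected by comparing against the
          // broadcast max with an absolute tolerance of 1e-5, so exact ties
          // (and near-ties) all receive a copy of the gradient.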

          output->tensor<T, 5>()
              .slice(dst_indices, dst_sizes)
              .device(context->eigen_cpu_device()) += select_slice;
        }
      }
    }
  }
};

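// MaxPooling3dGradOp computes the gradient of MaxPool3D. Its inputs are the
// original input, the original (forward) output, and the incoming gradient
// with respect to that output; the result has the shape of the original input.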
template <class Device, class T>
class MaxPooling3dGradOp : public OpKernel {
 public:
  explicit MaxPooling3dGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->device_type() == DEVICE_CPU) {
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument(
              "Default MaxPooling3dGradOp only supports NDHWC ",
              "on device type ", DeviceTypeString(context->device_type())));
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
                 GetTensorDim(stride_, data_format_, 'N') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
                 GetTensorDim(stride_, data_format_, 'C') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_backprop = context->input(2);
    OP_REQUIRES(context, tensor_in.dims() == 5,
                errors::InvalidArgument("tensor_in must be 5-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 5,
                errors::InvalidArgument("tensor_out must be 5-dimensional"));
    OP_REQUIRES(context, out_backprop.dims() == 5,
                errors::InvalidArgument("out_backprop must be 5-dimensional"));

    const TensorShape& output_shape = tensor_in.shape();
    Tensor* input_backprop;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, output_shape, &input_backprop));
    std::array<int64, 3> input_size{
        {GetTensorDim(output_shape, data_format_, '2'),
         GetTensorDim(output_shape, data_format_, '1'),
         GetTensorDim(output_shape, data_format_, '0')}};
    std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
                                 GetTensorDim(ksize_, data_format_, '1'),
                                 GetTensorDim(ksize_, data_format_, '0')}};
    std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
                                 GetTensorDim(stride_, data_format_, '1'),
                                 GetTensorDim(stride_, data_format_, '0')}};
    std::array<int64, 3> out, padding;

    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
                                            padding_, &out, &padding));
    LaunchMaxPooling3dGradOp<Device, T>::launch(
        context, tensor_in, tensor_out, out_backprop, window, stride, out,
        padding, data_format_, input_backprop);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename T>
struct LaunchAvgPooling3dGradOp<CPUDevice, T> {
  static void launch(OpKernelContext* context,
                     const TensorShape& tensor_in_shape,
                     const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& output_shape,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* output) {
    output->flat<T>().setZero();
    std::array<int64, 3> input_size = {{tensor_in_shape.dim_size(3),
                                        tensor_in_shape.dim_size(2),
                                        tensor_in_shape.dim_size(1)}};
    for (int64 p = 0; p < out_backprop.dim_size(3); ++p) {
      // Calculate broadcast size for planes/rows/cols. For SAME padding,
      // current index could be in the padding area, and
      //   p * stride_planes + window_planes
      // could be beyond the input tensor's boundary. In such cases, change
      // the starting index and reduce the broadcast size.
      //
      // The same procedure is repeated for every spatial dimension in the
      // nested loops below.
      int pindex, psize;
      OP_REQUIRES_OK(context,
                     GetBroadcastSize(p, input_size[0], window[0], stride[0],
                                      padding[0], &pindex, &psize));
      for (int64 r = 0; r < out_backprop.dim_size(2); ++r) {
        int rindex, rsize;
        OP_REQUIRES_OK(context,
                       GetBroadcastSize(r, input_size[1], window[1], stride[1],
                                        padding[1], &rindex, &rsize));
        for (int64 c = 0; c < out_backprop.dim_size(1); ++c) {
          int cindex, csize;
          OP_REQUIRES_OK(
              context, GetBroadcastSize(c, input_size[2], window[2], stride[2],
                                        padding[2], &cindex, &csize));
          TensorSlice src{{0, -1}, {c, 1}, {r, 1}, {p, 1}, {0, -1}};
          TensorSlice dst{{0, -1},
                          {cindex, csize},
                          {rindex, rsize},
                          {pindex, psize},
                          {0, -1}};
          Eigen::DSizes<Eigen::DenseIndex, 5> src_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> src_sizes;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_sizes;
          src.FillIndicesAndSizes<5>(out_backprop.shape(), &src_indices,
                                     &src_sizes);
          dst.FillIndicesAndSizes<5>(tensor_in_shape, &dst_indices, &dst_sizes);
#if !defined(EIGEN_HAS_INDEX_LIST)
          Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
#else
          Eigen::IndexList<Eigen::type2index<1>, int, int, int,
                           Eigen::type2index<1>>
              bcast;
          bcast.set(1, csize);
          bcast.set(2, rsize);
          bcast.set(3, psize);
#endif
          Eigen::Tensor<T, 5, Eigen::RowMajor> slices(src_sizes);
          slices.device(context->eigen_cpu_device()) =
              out_backprop.tensor<T, 5>().slice(src_indices, src_sizes);
          // Divide by the size of the actual patch (psize * rsize * csize).
          float divide_size = rsize * csize * psize * 1.0f;
          slices *= slices.constant(1.0f / divide_size);
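          // Each input element in the (possibly clipped) patch therefore
          // receives grad / (psize * rsize * csize), matching
          // d(mean)/d(x_i) = 1 / patch_size.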

          output->tensor<T, 5>()
              .slice(dst_indices, dst_sizes)
              .device(context->eigen_cpu_device()) += slices.broadcast(bcast);
        }
      }
    }
  }
};

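// AvgPooling3dGradOp receives the original input *shape* (a 1-D host tensor
// of 5 elements) rather than the original input values, since the gradient of
// average pooling does not depend on the input data itself.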
template <class Device, class T>
class AvgPooling3dGradOp : public OpKernel {
 public:
  explicit AvgPooling3dGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->device_type() == DEVICE_CPU) {
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument(
              "Default AvgPooling3dGradOp only supports NDHWC ",
              "on device type ", DeviceTypeString(context->device_type())));
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
                 GetTensorDim(stride_, data_format_, 'N') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
                 GetTensorDim(stride_, data_format_, 'C') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in_shape = context->input(0);
    const Tensor& out_backprop = context->input(1);
    OP_REQUIRES(
        context,
        tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 5,
        errors::InvalidArgument("tensor_in_shape must be 1-dimensional and "
                                "contain 5 elements"));
    OP_REQUIRES(context, out_backprop.dims() == 5,
                errors::InvalidArgument("out_backprop must be 5-dimensional"));

    TensorShape output_shape;
    auto shape_vec = tensor_in_shape.vec<int32>();
    for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
      output_shape.AddDim(shape_vec(i));
    }

    Tensor* output;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

    // Dimension order for these arrays is x, y, z.
    std::array<int64, 3> input_size{
        {GetTensorDim(output_shape, data_format_, '2'),
         GetTensorDim(output_shape, data_format_, '1'),
         GetTensorDim(output_shape, data_format_, '0')}};
    std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
                                 GetTensorDim(ksize_, data_format_, '1'),
                                 GetTensorDim(ksize_, data_format_, '0')}};
    std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
                                 GetTensorDim(stride_, data_format_, '1'),
                                 GetTensorDim(stride_, data_format_, '0')}};
    std::array<int64, 3> padding, out;

    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
                                            padding_, &out, &padding));

    LaunchAvgPooling3dGradOp<Device, T>::launch(
        context, output_shape, out_backprop, window, stride, out, padding,
        data_format_, output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename T>
struct LaunchMaxPooling3dGradGradOp<CPUDevice, T> {
  static void launch(OpKernelContext* context, const Pool3dParameters& params,
                     const Tensor& tensor_in, const Tensor& tensor_out,
                     const Tensor& tensor_top_diff,
                     Tensor* tensor_bottom_diff) {
    OP_REQUIRES(
        context, params.data_format == FORMAT_NHWC,
        errors::InvalidArgument("Default MaxPooling3dGradGradOp only supports ",
                                "NDHWC on CPU device type"));

    typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        ConstEigenMatrixMap;
    typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        EigenMatrixMap;

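    // Each map views a 5-D NDHWC tensor as a (depth x planes*rows*cols*batch)
    // matrix: depth is the innermost dimension, so column j holds the channel
    // vector at flattened spatial/batch position j.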
    ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                               params.tensor_in_planes * params.tensor_in_cols *
                                   params.tensor_in_rows *
                                   params.tensor_in_batch);
    ConstEigenMatrixMap out_mat(tensor_out.flat<T>().data(), params.depth,
                                params.out_plane * params.out_width *
                                    params.out_height * params.tensor_in_batch);
    ConstEigenMatrixMap top_diff_mat(
        tensor_top_diff.flat<T>().data(), params.depth,
        params.tensor_in_planes * params.tensor_in_cols *
            params.tensor_in_rows * params.tensor_in_batch);
    EigenMatrixMap bottom_diff_mat(
        tensor_bottom_diff->flat<T>().data(), params.depth,
        params.out_plane * params.out_width * params.out_height *
            params.tensor_in_batch);

    const DeviceBase::CpuWorkerThreads& worker_threads =
        *(context->device()->tensorflow_cpu_worker_threads());

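    // Each shard processes a contiguous range [start, limit) of batch entries.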
    auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
                     int64 start, int64 limit) {
      const int32 depth = params.depth;
      const int32 in_planes = params.tensor_in_planes;
      const int32 in_rows = params.tensor_in_rows;
      const int32 in_cols = params.tensor_in_cols;
      const int32 pad_planes = params.pad_planes;
      const int32 pad_rows = params.pad_rows;
      const int32 pad_cols = params.pad_cols;
      const int32 window_planes = params.window_planes;
      const int32 window_rows = params.window_rows;
      const int32 window_cols = params.window_cols;
      const int32 plane_stride = params.plane_stride;
      const int32 row_stride = params.row_stride;
      const int32 col_stride = params.col_stride;
      const int32 out_plane = params.out_plane;
      const int32 out_height = params.out_height;
      const int32 out_width = params.out_width;

      {
        // Initializes the output grad backprop tensor with 0.
        const int32 output_image_size =
            out_plane * out_height * out_width * params.depth;
        EigenMatrixMap bottom_diff_shard(
            bottom_diff_mat.data() + start * output_image_size, 1,
            (limit - start) * output_image_size);
        bottom_diff_shard.setZero();
      }

      for (int b = start; b < limit; ++b) {
        for (int pp = 0; pp < out_plane; ++pp) {
          for (int ph = 0; ph < out_height; ++ph) {
            for (int pw = 0; pw < out_width; ++pw) {
              // (p_start, p_end) * (h_start, h_end) * (w_start, w_end) is the
              // range that the input vector projects to.
              int p_start = pp * plane_stride - pad_planes;
              const int p_end = std::min(p_start + window_planes, in_planes);
              int h_start = ph * row_stride - pad_rows;
              const int h_end = std::min(h_start + window_rows, in_rows);
              int w_start = pw * col_stride - pad_cols;
              const int w_end = std::min(w_start + window_cols, in_cols);
              p_start = std::max(p_start, 0);
              h_start = std::max(h_start, 0);
              w_start = std::max(w_start, 0);
              const int out_index =
                  ((b * out_plane + pp) * out_height + ph) * out_width + pw;
              // Find value corresponding to the input maximum in top_diff.
              for (int d = 0; d < depth; ++d) {
                const T& output_ref = out_mat.coeffRef(d, out_index);
                bool should_stop = false;
                for (int p = p_start; p < p_end && !should_stop; ++p) {
                  for (int h = h_start; h < h_end && !should_stop; ++h) {
                    for (int w = w_start; w < w_end && !should_stop; ++w) {
                      const int in_index =
                          ((b * in_planes + p) * in_rows + h) * in_cols + w;
                      const T& input_ref = in_mat.coeffRef(d, in_index);
                      if (output_ref == input_ref) {
                        T& bottom_diff_ref =
                            bottom_diff_mat.coeffRef(d, out_index);
                        bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
                        should_stop = true;
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    };
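    // shard_cost estimates the work per batch entry (output volume times
    // window volume) so Shard can pick a reasonable parallel granularity.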
    const int64 shard_cost =
        params.out_plane * params.out_height * params.out_width * params.depth *
        params.window_planes * params.window_rows * params.window_cols;
    Shard(worker_threads.num_threads, worker_threads.workers,
          params.tensor_in_batch, shard_cost, shard);
  }
};

template <class Device, class T>
class MaxPooling3dGradGradOp : public OpKernel {
 public:
  explicit MaxPooling3dGradGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    const int32 ksize_c = GetTensorDim(ksize_, data_format_, 'C');
    const int32 stride_c = GetTensorDim(stride_, data_format_, 'C');
    OP_REQUIRES(context, ksize_c == 1 && stride_c == 1,
                errors::Unimplemented("MaxPooling3dGradGrad is not yet "
                                      "supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_grad_backprop = context->input(2);

    // For maxpooling3d, tensor_in should have 5 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 5,
                errors::InvalidArgument("tensor_in must be 5-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 5,
                errors::InvalidArgument("tensor_out must be 5-dimensional"));
    // For maxpooling3d, out_grad_backprop should have 5 dimensions.
    OP_REQUIRES(
        context, out_grad_backprop.dims() == 5,
        errors::InvalidArgument("out_grad_backprop must be 5-dimensional"));

    Pool3dParameters params{context,  ksize_,       stride_,
                            padding_, data_format_, tensor_in.shape()};

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {2}, 0, tensor_out.shape(), &output));

    LaunchMaxPooling3dGradGradOp<Device, T>::launch(
        context, params, tensor_in, tensor_out, out_grad_backprop, output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

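// Registers the forward and backward 3-D pooling kernels for a device/type
// pair. AvgPool3DGrad pins orig_input_shape to host memory because Compute()
// reads the shape values directly to build the output shape.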
#define REGISTER_KERNELS(D, T)                                             \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("MaxPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"),         \
      Pooling3DOp<D##Device, T, MAX>);                                     \
  REGISTER_KERNEL_BUILDER(Name("MaxPool3DGrad")                            \
                              .Device(DEVICE_##D)                          \
                              .TypeConstraint<T>("T")                      \
                              .TypeConstraint<T>("TInput"),                \
                          MaxPooling3dGradOp<D##Device, T>);               \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("MaxPool3DGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
      MaxPooling3dGradGradOp<D##Device, T>);                               \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("AvgPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"),         \
      Pooling3DOp<D##Device, T, AVG>);                                     \
  REGISTER_KERNEL_BUILDER(Name("AvgPool3DGrad")                            \
                              .Device(DEVICE_##D)                          \
                              .TypeConstraint<T>("T")                      \
                              .HostMemory("orig_input_shape"),             \
                          AvgPooling3dGradOp<D##Device, T>);

#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T)
TF_CALL_float(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

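// The GPU specializations delegate to cuDNN (or MIOpen under ROCm) via
// DnnPooling3dOp / DnnPooling3dGradOp from cudnn_pooling_gpu.h.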
template <typename T>
struct LaunchPoolingOp<GPUDevice, T, AVG> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, window,
                               stride, padding, data_format, tensor_in, output);
  }
};

template <typename T>
struct LaunchPoolingOp<GPUDevice, T, MAX> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, window,
                               stride, padding, data_format, tensor_in, output);
  }
};

template <typename T>
struct LaunchMaxPooling3dGradOp<GPUDevice, T> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const Tensor& tensor_out, const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& out,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* input_backprop) {
    const TensorShape output_shape = tensor_in.shape();
    DnnPooling3dGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
                                   window, stride, padding, out, data_format,
                                   out_backprop, output_shape, &tensor_in,
                                   &tensor_out, input_backprop);
  }
};

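// The cuDNN average-pooling gradient does not need the original input or
// output tensors, so nullptrs are passed for them below.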
template <typename T>
struct LaunchAvgPooling3dGradOp<GPUDevice, T> {
  static void launch(OpKernelContext* context,
                     const TensorShape& tensor_in_shape,
                     const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& out,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* output) {
    DnnPooling3dGradOp<T>::Compute(
        context, se::dnn::PoolingMode::kAverage, window, stride, padding, out,
        data_format, out_backprop, tensor_in_shape, nullptr, nullptr, output);
  }
};

template <typename T>
struct LaunchMaxPooling3dGradGradOp<GPUDevice, T> {
  static void launch(OpKernelContext* context, const Pool3dParameters& params,
                     const Tensor& tensor_in, const Tensor& tensor_out,
                     const Tensor& tensor_top_diff,
                     Tensor* tensor_bottom_diff) {
    bool status = functor::MaxPool3dGradBackward<T>()(
        params.data_format, tensor_in.flat<T>().data(),
        tensor_out.flat<T>().data(), params.tensor_in_batch, params.out_plane,
        params.out_height, params.out_width, params.depth,
        params.tensor_in_planes, params.tensor_in_rows, params.tensor_in_cols,
        params.window_planes, params.window_rows, params.window_cols,
        params.plane_stride, params.row_stride, params.col_stride,
        params.pad_planes, params.pad_rows, params.pad_cols,
        tensor_top_diff.flat<T>().data(), tensor_bottom_diff->flat<T>().data(),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPool3dGradBackward"));
    }
  }
};

#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T)
TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
#undef REGISTER_GPU_KERNELS

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef REGISTER_KERNELS

}  // namespace tensorflow