/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/pooling_ops_3d.h"

#include <array>

#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/kernel_shape_util.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/eigen_pooling.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cudnn_pooling_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#endif

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

Pool3dParameters::Pool3dParameters(OpKernelContext* context,
                                   const std::vector<int32>& ksize,
                                   const std::vector<int32>& stride,
                                   Padding padding, TensorFormat data_format,
                                   const TensorShape& tensor_in_shape) {
  // For 3d pooling, tensor_in should have 5 dimensions.
  OP_REQUIRES(context, tensor_in_shape.dims() == 5,
              errors::InvalidArgument("tensor_in must be 5-dimensional"));

  this->data_format = data_format;
  depth = GetTensorDim(tensor_in_shape, data_format, 'C');
  tensor_in_planes = GetTensorDim(tensor_in_shape, data_format, '0');
  tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, '1');
  tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, '2');
  tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
  window_planes = GetTensorDim(ksize, data_format, '0');
  window_rows = GetTensorDim(ksize, data_format, '1');
  window_cols = GetTensorDim(ksize, data_format, '2');
  depth_window = GetTensorDim(ksize, data_format, 'C');
  plane_stride = GetTensorDim(stride, data_format, '0');
  row_stride = GetTensorDim(stride, data_format, '1');
  col_stride = GetTensorDim(stride, data_format, '2');
  depth_stride = GetTensorDim(stride, data_format, 'C');

  // We only support 3D pooling across plane/width/height. Depthwise
  // pooling is not supported.
  OP_REQUIRES(
      context, depth_window == 1 && depth_stride == 1,
      errors::Unimplemented(
          "Pooling3d only supports pooling across plane/width/height."));

  OP_REQUIRES_OK(context, GetWindowedOutputSize(tensor_in_planes, window_planes,
                                                plane_stride, padding,
                                                &out_plane, &pad_planes));
  OP_REQUIRES_OK(context,
                 GetWindowedOutputSize(tensor_in_rows, window_rows, row_stride,
                                       padding, &out_height, &pad_rows));
  OP_REQUIRES_OK(context,
                 GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride,
                                       padding, &out_width, &pad_cols));
}

TensorShape Pool3dParameters::forward_output_shape() {
  return ShapeFromFormat(data_format, tensor_in_batch,
                         {{out_plane, out_height, out_width}}, depth);
}

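// CPU forward launchers: both specializations evaluate an Eigen cuboid
// pooling expression on the thread-pool device. The explicit `padding` array
// is unused on this path; the Eigen helpers derive padding from
// `padding_type`.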
template <typename T>
struct LaunchPoolingOp<CPUDevice, T, AVG> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    output->tensor<T, 5>().device(context->eigen_device<CPUDevice>()) =
        Eigen::CuboidAvgPooling(tensor_in.tensor<T, 5>(), window[0], window[1],
                                window[2], stride[0], stride[1], stride[2],
                                BrainPadding2EigenPadding(padding_type));
  }
};

template <typename T>
struct LaunchPoolingOp<CPUDevice, T, MAX> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    output->tensor<T, 5>().device(context->eigen_device<CPUDevice>()) =
        Eigen::CuboidMaxPooling(tensor_in.tensor<T, 5>(), window[0], window[1],
                                window[2], stride[0], stride[1], stride[2],
                                BrainPadding2EigenPadding(padding_type));
  }
};

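// Forward op for 3d pooling (MaxPool3D / AvgPool3D). The constructor checks
// the ksize/strides/padding/data_format attributes; Compute() derives the
// output shape and defers to the device-specific LaunchPoolingOp above.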
template <typename Device, typename T, PoolingType Type>
class Pooling3DOp : public UnaryOp<T> {
 public:
  explicit Pooling3DOp(OpKernelConstruction* context) : UnaryOp<T>(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->device_type() == DEVICE_CPU) {
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument("Default Pooling3DOp only supports NDHWC ",
                                  "on device type ",
                                  DeviceTypeString(context->device_type())));
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
                 GetTensorDim(stride_, data_format_, 'N') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
                 GetTensorDim(stride_, data_format_, 'C') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    OP_REQUIRES(context, tensor_in.dims() == 5,
                errors::InvalidArgument("tensor_in must be 5-dimensional"));
    const int64_t depth = GetTensorDim(tensor_in, data_format_, 'C');
    const int64_t in_batch = GetTensorDim(tensor_in, data_format_, 'N');

    // Dimension order for these arrays is: x, y, z.
    std::array<int64, 3> input_size{
        {GetTensorDim(tensor_in, data_format_, '2'),
         GetTensorDim(tensor_in, data_format_, '1'),
         GetTensorDim(tensor_in, data_format_, '0')}};
    std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
                                 GetTensorDim(ksize_, data_format_, '1'),
                                 GetTensorDim(ksize_, data_format_, '0')}};
    std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
                                 GetTensorDim(stride_, data_format_, '1'),
                                 GetTensorDim(stride_, data_format_, '0')}};
    std::array<int64, 3> padding, out;

    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
                                            padding_, &out, &padding));

    TensorShape out_shape = ShapeFromFormat(data_format_, in_batch,
                                            {{out[2], out[1], out[0]}}, depth);
    Tensor* output;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
    if (out_shape.num_elements() == 0) return;
    LaunchPoolingOp<Device, T, Type>::launch(context, tensor_in, window, stride,
                                             padding, data_format_, padding_,
                                             output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

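// CPU gradient of 3d max pooling. For every output coordinate, the input
// window it was pooled from is compared against the pooled maximum, and the
// incoming gradient is broadcast onto the positions that attain that maximum.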
template <typename T>
struct LaunchMaxPooling3dGradOp<CPUDevice, T> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const Tensor& tensor_out, const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& out,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* output) {
    output->flat<T>().setZero();
    for (int64_t p = 0; p < out_backprop.dim_size(3); ++p) {
      // Calculate broadcast size for planes/rows/cols. For SAME padding,
      // current index could be in the padding area, and
      // p * stride_planes + window_planes
      // could be beyond the input tensor's boundary. In such cases, change
      // the starting index and reduce the broadcast size.
      //
      // The same procedure is repeated for every spatial dimension in the
      // nested loops below.
      int pindex, psize;
      std::array<int64, 3> input_size{{tensor_in.dim_size(3),
                                       tensor_in.dim_size(2),
                                       tensor_in.dim_size(1)}};
      OP_REQUIRES_OK(context,
                     GetBroadcastSize(p, input_size[0], window[0], stride[0],
                                      padding[0], &pindex, &psize));
      for (int64_t r = 0; r < out_backprop.dim_size(2); ++r) {
        int rindex, rsize;
        OP_REQUIRES_OK(context,
                       GetBroadcastSize(r, input_size[1], window[1], stride[1],
                                        padding[1], &rindex, &rsize));
        for (int64_t c = 0; c < out_backprop.dim_size(1); ++c) {
          int cindex, csize;
          OP_REQUIRES_OK(
              context, GetBroadcastSize(c, input_size[2], window[2], stride[2],
                                        padding[2], &cindex, &csize));
          TensorSlice src{{0, -1}, {c, 1}, {r, 1}, {p, 1}, {0, -1}};
          TensorSlice dst{{0, -1},
                          {cindex, csize},
                          {rindex, rsize},
                          {pindex, psize},
                          {0, -1}};
          Eigen::DSizes<Eigen::DenseIndex, 5> src_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> src_sizes;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_sizes;
          src.FillIndicesAndSizes<5>(out_backprop.shape(), &src_indices,
                                     &src_sizes);
          dst.FillIndicesAndSizes<5>(tensor_in.shape(), &dst_indices,
                                     &dst_sizes);

#if !defined(EIGEN_HAS_INDEX_LIST)
          Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
#else
          Eigen::IndexList<Eigen::type2index<1>, int, int, int,
                           Eigen::type2index<1>>
              bcast;
          bcast.set(1, csize);
          bcast.set(2, rsize);
          bcast.set(3, psize);
#endif

          // Slice from tensor_in.
          Eigen::Tensor<T, 5, Eigen::RowMajor> tensor_in_slice(dst_sizes);
          tensor_in_slice.device(context->eigen_cpu_device()) =
              tensor_in.tensor<T, 5>().slice(dst_indices, dst_sizes);

          // Slice from tensor_out.
          Eigen::Tensor<T, 5, Eigen::RowMajor> tensor_out_slice(src_sizes);
          tensor_out_slice.device(context->eigen_cpu_device()) =
              tensor_out.tensor<T, 5>().slice(src_indices, src_sizes);

          // Backprop slice.
          Eigen::Tensor<T, 5, Eigen::RowMajor> out_backprop_slice(src_sizes);
          out_backprop_slice.device(context->eigen_cpu_device()) =
              out_backprop.tensor<T, 5>().slice(src_indices, src_sizes);

          // The true backprop slice: if an element is the max, choose
          // the backprop slice; otherwise set to 0.
          Eigen::Tensor<T, 5, Eigen::RowMajor> select_slice(dst_sizes);
          Eigen::Tensor<T, 5, Eigen::RowMajor> mat0(dst_sizes);
          mat0.setZero();
          select_slice =
              ((tensor_in_slice - tensor_out_slice.broadcast(bcast)).abs() <
               tensor_in_slice.constant(1e-5))
                  .select(out_backprop_slice.broadcast(bcast), mat0);

          output->tensor<T, 5>()
              .slice(dst_indices, dst_sizes)
              .device(context->eigen_cpu_device()) += select_slice;
        }
      }
    }
  }
};

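// Grad op for MaxPool3D: consumes the original input, the pooled output and
// the backpropagated gradient, and produces the gradient with respect to the
// input.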
template <class Device, class T>
class MaxPooling3dGradOp : public OpKernel {
 public:
  explicit MaxPooling3dGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->device_type() == DEVICE_CPU) {
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument(
              "Default MaxPooling3dGradOp only supports NDHWC ",
              "on device type ", DeviceTypeString(context->device_type())));
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
                 GetTensorDim(stride_, data_format_, 'N') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
                 GetTensorDim(stride_, data_format_, 'C') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_backprop = context->input(2);
    OP_REQUIRES(context, tensor_in.dims() == 5,
                errors::InvalidArgument("tensor_in must be 5-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 5,
                errors::InvalidArgument("tensor_out must be 5-dimensional"));
    OP_REQUIRES(context, out_backprop.dims() == 5,
                errors::InvalidArgument("out_backprop must be 5-dimensional"));

    const TensorShape& output_shape = tensor_in.shape();
    Tensor* input_backprop;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, output_shape, &input_backprop));
    std::array<int64, 3> input_size{
        {GetTensorDim(output_shape, data_format_, '2'),
         GetTensorDim(output_shape, data_format_, '1'),
         GetTensorDim(output_shape, data_format_, '0')}};
    std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
                                 GetTensorDim(ksize_, data_format_, '1'),
                                 GetTensorDim(ksize_, data_format_, '0')}};
    std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
                                 GetTensorDim(stride_, data_format_, '1'),
                                 GetTensorDim(stride_, data_format_, '0')}};
    std::array<int64, 3> out, padding;

    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
                                            padding_, &out, &padding));
    LaunchMaxPooling3dGradOp<Device, T>::launch(
        context, tensor_in, tensor_out, out_backprop, window, stride, out,
        padding, data_format_, input_backprop);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

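// CPU gradient of 3d average pooling. Each incoming gradient element is
// divided by the size of the (possibly clipped) pooling window it came from
// and scattered back over the input positions that were averaged.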
template <typename T>
struct LaunchAvgPooling3dGradOp<CPUDevice, T> {
  static void launch(OpKernelContext* context,
                     const TensorShape& tensor_in_shape,
                     const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& output_shape,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* output) {
    OP_REQUIRES(
        context, tensor_in_shape.dim_size(0) == out_backprop.dim_size(0),
        errors::InvalidArgument(
            "Expected first dimension of tensor_in_shape and "
            "out_backprop to match, got ",
            tensor_in_shape.dim_size(0), " and ", out_backprop.dim_size(0)));
    OP_REQUIRES(
        context, tensor_in_shape.dim_size(4) == out_backprop.dim_size(4),
        errors::InvalidArgument(
            "Expected last dimension of tensor_in_shape and "
            "out_backprop to match, got ",
            tensor_in_shape.dim_size(4), " and ", out_backprop.dim_size(4)));

    output->flat<T>().setZero();
    std::array<int64, 3> input_size = {{tensor_in_shape.dim_size(3),
                                        tensor_in_shape.dim_size(2),
                                        tensor_in_shape.dim_size(1)}};
    for (int64_t p = 0; p < out_backprop.dim_size(3); ++p) {
      // Calculate broadcast size for planes/rows/cols. For SAME padding,
      // current index could be in the padding area, and
      // p * stride_planes + window_planes
      // could be beyond the input tensor's boundary. In such cases, change
      // the starting index and reduce the broadcast size.
      //
      // The same procedure is repeated for every spatial dimension in the
      // nested loops below.
      int pindex, psize;
      OP_REQUIRES_OK(context,
                     GetBroadcastSize(p, input_size[0], window[0], stride[0],
                                      padding[0], &pindex, &psize));
      for (int64_t r = 0; r < out_backprop.dim_size(2); ++r) {
        int rindex, rsize;
        OP_REQUIRES_OK(context,
                       GetBroadcastSize(r, input_size[1], window[1], stride[1],
                                        padding[1], &rindex, &rsize));
        for (int64_t c = 0; c < out_backprop.dim_size(1); ++c) {
          int cindex, csize;
          OP_REQUIRES_OK(
              context, GetBroadcastSize(c, input_size[2], window[2], stride[2],
                                        padding[2], &cindex, &csize));
          TensorSlice src{{0, -1}, {c, 1}, {r, 1}, {p, 1}, {0, -1}};
          TensorSlice dst{{0, -1},
                          {cindex, csize},
                          {rindex, rsize},
                          {pindex, psize},
                          {0, -1}};
          Eigen::DSizes<Eigen::DenseIndex, 5> src_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> src_sizes;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_sizes;
          src.FillIndicesAndSizes<5>(out_backprop.shape(), &src_indices,
                                     &src_sizes);
          dst.FillIndicesAndSizes<5>(tensor_in_shape, &dst_indices, &dst_sizes);
#if !defined(EIGEN_HAS_INDEX_LIST)
          Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
#else
          Eigen::IndexList<Eigen::type2index<1>, int, int, int,
                           Eigen::type2index<1>>
              bcast;
          bcast.set(1, csize);
          bcast.set(2, rsize);
          bcast.set(3, psize);
#endif
          Eigen::Tensor<T, 5, Eigen::RowMajor> slices(src_sizes);
          slices.device(context->eigen_cpu_device()) =
              out_backprop.tensor<T, 5>().slice(src_indices, src_sizes);
          // Divide by the size of the actual patch (psize * rsize * csize).
          float divide_size = rsize * csize * psize * 1.0f;
          slices *= slices.constant(1.0f / divide_size);

          output->tensor<T, 5>()
              .slice(dst_indices, dst_sizes)
              .device(context->eigen_cpu_device()) += slices.broadcast(bcast);
        }
      }
    }
  }
};

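// Grad op for AvgPool3D. Takes the original input shape as a 1-D int32 tensor
// of 5 elements plus the backpropagated gradient, and produces the gradient
// with respect to the original input.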
template <class Device, class T>
class AvgPooling3dGradOp : public OpKernel {
 public:
  explicit AvgPooling3dGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->device_type() == DEVICE_CPU) {
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument(
              "Default AvgPooling3dGradOp only supports NDHWC ",
              "on device type ", DeviceTypeString(context->device_type())));
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
                 GetTensorDim(stride_, data_format_, 'N') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
                 GetTensorDim(stride_, data_format_, 'C') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in_shape = context->input(0);
    const Tensor& out_backprop = context->input(1);
    OP_REQUIRES(
        context,
        tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 5,
        errors::InvalidArgument("tensor_in must be 1-dimensional and 5 "
                                "elements"));
    OP_REQUIRES(context, out_backprop.dims() == 5,
                errors::InvalidArgument("out_backprop must be 5-dimensional"));

    TensorShape output_shape;
    auto shape_vec = tensor_in_shape.vec<int32>();
    for (int64_t i = 0; i < tensor_in_shape.NumElements(); ++i) {
      output_shape.AddDim(shape_vec(i));
    }

    Tensor* output;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

    // Dimension order for these arrays is x, y, z.
    std::array<int64, 3> input_size{
        {GetTensorDim(output_shape, data_format_, '2'),
         GetTensorDim(output_shape, data_format_, '1'),
         GetTensorDim(output_shape, data_format_, '0')}};
    std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
                                 GetTensorDim(ksize_, data_format_, '1'),
                                 GetTensorDim(ksize_, data_format_, '0')}};
    std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
                                 GetTensorDim(stride_, data_format_, '1'),
                                 GetTensorDim(stride_, data_format_, '0')}};
    std::array<int64, 3> padding, out;

    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
                                            padding_, &out, &padding));

    LaunchAvgPooling3dGradOp<Device, T>::launch(
        context, output_shape, out_backprop, window, stride, out, padding,
        data_format_, output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

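// CPU second-order gradient of 3d max pooling: for every pooled output
// element, the first position inside its input window whose value equals the
// pooled maximum is located, and the corresponding entry of the incoming
// (top) gradient is forwarded. Work is sharded across the batch dimension.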
template <typename T>
struct LaunchMaxPooling3dGradGradOp<CPUDevice, T> {
  static void launch(OpKernelContext* context, const Pool3dParameters& params,
                     const Tensor& tensor_in, const Tensor& tensor_out,
                     const Tensor& tensor_top_diff,
                     Tensor* tensor_bottom_diff) {
    OP_REQUIRES(
        context, params.data_format == FORMAT_NHWC,
        errors::InvalidArgument("Default MaxPooling3dGradGradOp only supports",
                                "NDHWC on CPU device type"));

    typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        ConstEigenMatrixMap;
    typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        EigenMatrixMap;

    ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                               params.tensor_in_planes * params.tensor_in_cols *
                                   params.tensor_in_rows *
                                   params.tensor_in_batch);
    ConstEigenMatrixMap out_mat(tensor_out.flat<T>().data(), params.depth,
                                params.out_plane * params.out_width *
                                    params.out_height * params.tensor_in_batch);
    ConstEigenMatrixMap top_diff_mat(
        tensor_top_diff.flat<T>().data(), params.depth,
        params.tensor_in_planes * params.tensor_in_cols *
            params.tensor_in_rows * params.tensor_in_batch);
    EigenMatrixMap bottom_diff_mat(
        tensor_bottom_diff->flat<T>().data(), params.depth,
        params.out_plane * params.out_width * params.out_height *
            params.tensor_in_batch);

    const DeviceBase::CpuWorkerThreads& worker_threads =
        *(context->device()->tensorflow_cpu_worker_threads());

    auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
                     int64_t start, int64_t limit) {
      const int32_t depth = params.depth;
      const int32_t in_planes = params.tensor_in_planes;
      const int32_t in_rows = params.tensor_in_rows;
      const int32_t in_cols = params.tensor_in_cols;
      const int32_t pad_planes = params.pad_planes;
      const int32_t pad_rows = params.pad_rows;
      const int32_t pad_cols = params.pad_cols;
      const int32_t window_planes = params.window_planes;
      const int32_t window_rows = params.window_rows;
      const int32_t window_cols = params.window_cols;
      const int32_t plane_stride = params.plane_stride;
      const int32_t row_stride = params.row_stride;
      const int32_t col_stride = params.col_stride;
      const int32_t out_plane = params.out_plane;
      const int32_t out_height = params.out_height;
      const int32_t out_width = params.out_width;

      {
        // Initializes the output grad backprop tensor with 0.
        const int32_t output_image_size =
            out_plane * out_height * out_width * params.depth;
        EigenMatrixMap bottom_diff_shard(
            bottom_diff_mat.data() + start * output_image_size, 1,
            (limit - start) * output_image_size);
        bottom_diff_shard.setZero();
      }

      for (int b = start; b < limit; ++b) {
        for (int pp = 0; pp < out_plane; ++pp) {
          for (int ph = 0; ph < out_height; ++ph) {
            for (int pw = 0; pw < out_width; ++pw) {
              // (p_start, p_end) * (h_start, h_end) * (w_start, w_end) is the
              // range that the input vector projects to.
              int p_start = pp * plane_stride - pad_planes;
              const int p_end = std::min(p_start + window_planes, in_planes);
              int h_start = ph * row_stride - pad_rows;
              const int h_end = std::min(h_start + window_rows, in_rows);
              int w_start = pw * col_stride - pad_cols;
              const int w_end = std::min(w_start + window_cols, in_cols);
              p_start = std::max(p_start, 0);
              h_start = std::max(h_start, 0);
              w_start = std::max(w_start, 0);
              const int out_index =
                  ((b * out_plane + pp) * out_height + ph) * out_width + pw;
              // Find value corresponding to the input maximum in top_diff.
              for (int d = 0; d < depth; ++d) {
                const T& output_ref = out_mat.coeffRef(d, out_index);
                bool should_stop = false;
                for (int p = p_start; p < p_end && !should_stop; ++p) {
                  for (int h = h_start; h < h_end && !should_stop; ++h) {
                    for (int w = w_start; w < w_end && !should_stop; ++w) {
                      const int in_index =
                          ((b * in_planes + p) * in_rows + h) * in_cols + w;
                      const T& input_ref = in_mat.coeffRef(d, in_index);
                      if (output_ref == input_ref) {
                        T& bottom_diff_ref =
                            bottom_diff_mat.coeffRef(d, out_index);
                        bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
                        should_stop = true;
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    };
    const int64_t shard_cost =
        params.out_plane * params.out_height * params.out_width * params.depth *
        params.window_planes * params.window_rows * params.window_cols;
    Shard(worker_threads.num_threads, worker_threads.workers,
          params.tensor_in_batch, shard_cost, shard);
  }
};

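// Second-order gradient op for MaxPool3D (MaxPool3DGradGrad). Validates the
// inputs, builds Pool3dParameters from the attributes, and defers to the
// device-specific LaunchMaxPooling3dGradGradOp.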
template <class Device, class T>
class MaxPooling3dGradGradOp : public OpKernel {
 public:
  explicit MaxPooling3dGradGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    const int32_t ksize_c = GetTensorDim(ksize_, data_format_, 'C');
    const int32_t stride_c = GetTensorDim(stride_, data_format_, 'C');
    OP_REQUIRES(context, ksize_c == 1 && stride_c == 1,
                errors::Unimplemented("MaxPooling3dGradGrad is not yet "
                                      "supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_grad_backprop = context->input(2);

    // For maxpooling3d, tensor_in should have 5 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 5,
                errors::InvalidArgument("tensor_in must be 5-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 5,
                errors::InvalidArgument("tensor_out must be 5-dimensional"));
    // For maxpooling3d, out_grad_backprop should have 5 dimensions.
    OP_REQUIRES(
        context, out_grad_backprop.dims() == 5,
        errors::InvalidArgument("out_grad_backprop must be 5-dimensional"));

    Pool3dParameters params{context,  ksize_,       stride_,
                            padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) return;  // params is invalid

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {2}, 0, tensor_out.shape(), &output));

    // Given access patterns in LaunchMaxPooling3dGradGradOp, these tensors must
    // have elements.
    OP_REQUIRES(context, tensor_in.NumElements() > 0,
                errors::InvalidArgument("received empty tensor tensor_in: ",
                                        tensor_in.DebugString()));
    OP_REQUIRES(context, tensor_out.NumElements() > 0,
                errors::InvalidArgument("received empty tensor tensor_out: ",
                                        tensor_out.DebugString()));
    OP_REQUIRES(
        context, out_grad_backprop.NumElements() > 0,
        errors::InvalidArgument("received empty tensor out_grad_backprop: ",
                                out_grad_backprop.DebugString()));
    OP_REQUIRES(context,
                tensor_in.NumElements() == out_grad_backprop.NumElements(),
                errors::InvalidArgument("tensor_in and out_grad_backprop must "
                                        "have same number of elements, got <",
                                        tensor_in.DebugString(), "> and <",
                                        out_grad_backprop.DebugString(), ">"));
    OP_REQUIRES(
        context, tensor_out.NumElements() == output->NumElements(),
        errors::InvalidArgument(
            "tensor_out and output must have same number of elements, got <",
            tensor_out.DebugString(), "> and <", output->DebugString(), ">"));

    LaunchMaxPooling3dGradGradOp<Device, T>::launch(
        context, params, tensor_in, tensor_out, out_grad_backprop, output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

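// Registers forward and gradient kernels for MaxPool3D, MaxPool3DGrad,
// MaxPool3DGradGrad, AvgPool3D and AvgPool3DGrad on device type D for
// element type T.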
#define REGISTER_KERNELS(D, T)                                             \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("MaxPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"),         \
      Pooling3DOp<D##Device, T, MAX>);                                     \
  REGISTER_KERNEL_BUILDER(Name("MaxPool3DGrad")                            \
                              .Device(DEVICE_##D)                          \
                              .TypeConstraint<T>("T")                      \
                              .TypeConstraint<T>("TInput"),                \
                          MaxPooling3dGradOp<D##Device, T>);               \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("MaxPool3DGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
      MaxPooling3dGradGradOp<D##Device, T>);                               \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("AvgPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"),         \
      Pooling3DOp<D##Device, T, AVG>);                                     \
  REGISTER_KERNEL_BUILDER(Name("AvgPool3DGrad")                            \
                              .Device(DEVICE_##D)                          \
                              .TypeConstraint<T>("T")                      \
                              .HostMemory("orig_input_shape"),             \
                          AvgPooling3dGradOp<D##Device, T>);

#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T)
TF_CALL_float(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS

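// GPU implementations: the forward and backward passes go through the
// StreamExecutor DNN path (cuDNN on CUDA, MIOpen on ROCm) via DnnPooling3dOp
// and DnnPooling3dGradOp from cudnn_pooling_gpu.h; MaxPool3DGradGrad uses the
// custom kernel declared in pooling_ops_3d_gpu.h.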
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename T>
struct LaunchPoolingOp<GPUDevice, T, AVG> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, window,
                               stride, padding, data_format, tensor_in, output);
  }
};

template <typename T>
struct LaunchPoolingOp<GPUDevice, T, MAX> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, window,
                               stride, padding, data_format, tensor_in, output);
  }
};

template <typename T>
struct LaunchMaxPooling3dGradOp<GPUDevice, T> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const Tensor& tensor_out, const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& out,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* input_backprop) {
    const TensorShape output_shape = tensor_in.shape();
    DnnPooling3dGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
                                   window, stride, padding, out, data_format,
                                   out_backprop, output_shape, &tensor_in,
                                   &tensor_out, input_backprop);
  }
};

template <typename T>
struct LaunchAvgPooling3dGradOp<GPUDevice, T> {
  static void launch(OpKernelContext* context,
                     const TensorShape& tensor_in_shape,
                     const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& out,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* output) {
    DnnPooling3dGradOp<T>::Compute(
        context, se::dnn::PoolingMode::kAverage, window, stride, padding, out,
        data_format, out_backprop, tensor_in_shape, nullptr, nullptr, output);
  }
};

template <typename T>
struct LaunchMaxPooling3dGradGradOp<GPUDevice, T> {
  static void launch(OpKernelContext* context, const Pool3dParameters& params,
                     const Tensor& tensor_in, const Tensor& tensor_out,
                     const Tensor& tensor_top_diff,
                     Tensor* tensor_bottom_diff) {
    bool status = functor::MaxPool3dGradBackward<T>()(
        params.data_format, tensor_in.flat<T>().data(),
        tensor_out.flat<T>().data(), params.tensor_in_batch, params.out_plane,
        params.out_height, params.out_width, params.depth,
        params.tensor_in_planes, params.tensor_in_rows, params.tensor_in_cols,
        params.window_planes, params.window_rows, params.window_cols,
        params.plane_stride, params.row_stride, params.col_stride,
        params.pad_planes, params.pad_rows, params.pad_cols,
        tensor_top_diff.flat<T>().data(), tensor_bottom_diff->flat<T>().data(),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPool3dGradBackward"));
    }
  }
};

#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T)
TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
#undef REGISTER_GPU_KERNELS

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef REGISTER_KERNELS

}  // namespace tensorflow