1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 // See docs in ../ops/nn_ops.cc.
17
18 #define EIGEN_USE_THREADS
19
20 #include "tensorflow/core/kernels/maxpooling_op.h"
21
22 #include <vector>
23 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
24 #include "tensorflow/core/common_runtime/device.h"
25 #include "tensorflow/core/framework/bounds_check.h"
26 #include "tensorflow/core/framework/numeric_op.h"
27 #include "tensorflow/core/framework/op_kernel.h"
28 #include "tensorflow/core/framework/register_types.h"
29 #include "tensorflow/core/framework/tensor.h"
30 #include "tensorflow/core/framework/tensor_shape.h"
31 #include "tensorflow/core/framework/tensor_slice.h"
32 #include "tensorflow/core/kernels/conv_2d.h"
33 #include "tensorflow/core/kernels/eigen_pooling.h"
34 #include "tensorflow/core/kernels/ops_util.h"
35 #include "tensorflow/core/kernels/pooling_ops_common.h"
36 #include "tensorflow/core/lib/core/errors.h"
37 #include "tensorflow/core/lib/gtl/array_slice.h"
38 #include "tensorflow/core/util/env_var.h"
39 #include "tensorflow/core/util/padding.h"
40 #include "tensorflow/core/util/tensor_format.h"
41 #include "tensorflow/core/util/use_cudnn.h"
42
43 #if GOOGLE_CUDA
44 #include "cuda/include/cudnn.h"
45 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
46 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
47 #include "tensorflow/core/platform/stream_executor.h"
48 #endif // GOOGLE_CUDA
49
50 namespace tensorflow {
51
52 typedef Eigen::ThreadPoolDevice CPUDevice;
53 typedef Eigen::GpuDevice GPUDevice;
54
55 const int kInvalidMaxPoolingIndex = -1;
56
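// Computes the max-pooled output of `tensor_in` and records, for every output
// element, the flattened argmax index of the input element that produced it.
// If `input_backprop` is non-null, the helper additionally scatters
// `out_backprop` through those argmax indices to produce the gradient with
// respect to the input; in that case `include_batch_in_index` must be true.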
57 template <typename Device, typename T>
58 static void SpatialMaxPoolWithArgMaxHelper(
59 OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
60 Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop,
61 const PoolParameters& params, const bool include_batch_in_index) {
62 if (input_backprop != nullptr) {
63 OP_REQUIRES(
64 context, include_batch_in_index,
65 errors::Internal(
66 "SpatialMaxPoolWithArgMaxHelper requires include_batch_in_index "
67             "to be True when input_backprop != nullptr"));
68 }
69
70 typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
71 ConstEigenMatrixMap;
72 typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
73 EigenMatrixMap;
74 typedef Eigen::Map<Eigen::Matrix<int64, Eigen::Dynamic, Eigen::Dynamic>>
75 EigenIndexMatrixMap;
76
77 ConstEigenMatrixMap in_mat(
78 tensor_in.flat<T>().data(), params.depth,
79 params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
80 EigenMatrixMap out_mat(
81 output->flat<T>().data(), params.depth,
82 params.out_width * params.out_height * params.tensor_in_batch);
83 EigenIndexMatrixMap out_arg_max_mat(
84 output_arg_max->flat<int64>().data(), params.depth,
85 params.out_width * params.out_height * params.tensor_in_batch);
86
87 const DeviceBase::CpuWorkerThreads& worker_threads =
88 *(context->device()->tensorflow_cpu_worker_threads());
89
90   // The code below does the following:
91 // 1. Flattens the input and output tensors into two dimensional arrays.
92 // tensor_in_as_matrix:
93 // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
94 // output_as_matrix:
95 // depth by (out_width * out_height * tensor_in_batch)
96 //
97 // 2. Walks through the set of columns in the flattened tensor_in_as_matrix,
98 // and updates the corresponding column(s) in output_as_matrix with the
99 // max value.
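  // Illustrative note (not part of the computation): for NHWC input of shape
  // [N, H, W, C], column (b * H + h) * W + w of tensor_in_as_matrix holds the
  // C channel values of input position (b, h, w), so the argmax recorded
  // below is the flat index
  //   include_batch_in_index ? ((b * H + h) * W + w) * C + d
  //                          : (h * W + w) * C + d.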
100   auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
101 &output_arg_max, &out_backprop,
102 include_batch_in_index](int64 start, int64 limit) {
103 const int32 depth = params.depth;
104 const int32 in_rows = params.tensor_in_rows;
105 const int32 in_cols = params.tensor_in_cols;
106 const int32 pad_rows = params.pad_rows;
107 const int32 pad_cols = params.pad_cols;
108 const int32 window_rows = params.window_rows;
109 const int32 window_cols = params.window_cols;
110 const int32 row_stride = params.row_stride;
111 const int32 col_stride = params.col_stride;
112 const int32 out_height = params.out_height;
113 const int32 out_width = params.out_width;
114
115 {
116 // Initializes the output tensor with MIN<T>.
117 const int32 output_image_size = out_height * out_width * depth;
118 EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, 1,
119 (limit - start) * output_image_size);
120 out_shard.setConstant(Eigen::NumTraits<T>::lowest());
121 EigenIndexMatrixMap out_arg_max_shard(
122 out_arg_max_mat.data() + start * output_image_size, 1,
123 (limit - start) * output_image_size);
124 out_arg_max_shard.setConstant(kInvalidMaxPoolingIndex);
125 }
126
127 for (int64 b = start; b < limit; ++b) {
128 for (int h = 0; h < in_rows; ++h) {
129 for (int w = 0; w < in_cols; ++w) {
130 // (h_start, h_end) * (w_start, w_end) is the range that the input
131 // vector projects to.
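          // Concretely, output row ph covers input rows
          // [ph * row_stride - pad_rows, ph * row_stride - pad_rows + window_rows),
          // so inverting that relation for hpad = h + pad_rows yields the
          // h_start/h_end bounds below (and likewise for columns).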
132 const int hpad = h + pad_rows;
133 const int wpad = w + pad_cols;
134 const int h_start =
135 (hpad < window_rows) ? 0 : (hpad - window_rows) / row_stride + 1;
136 const int h_end = std::min(hpad / row_stride + 1, out_height);
137 const int w_start =
138 (wpad < window_cols) ? 0 : (wpad - window_cols) / col_stride + 1;
139 const int w_end = std::min(wpad / col_stride + 1, out_width);
140 // compute elementwise max
141 const int64 in_index = (b * in_rows + h) * in_cols + w;
142 for (int ph = h_start; ph < h_end; ++ph) {
143 const int64 out_index_base = (b * out_height + ph) * out_width;
144 for (int pw = w_start; pw < w_end; ++pw) {
145 const int64 out_index = out_index_base + pw;
146             // NOTE(zhengxq): not using the Eigen matrix operation for
147             // now.
148 for (int d = 0; d < depth; ++d) {
149 const T& input_ref = in_mat.coeffRef(d, in_index);
150 T& output_ref = out_mat.coeffRef(d, out_index);
151 int64& out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index);
152 if (output_ref < input_ref ||
153 out_arg_max_ref == kInvalidMaxPoolingIndex) {
154 output_ref = input_ref;
155 if (include_batch_in_index) {
156 out_arg_max_ref = in_index * depth + d;
157 } else {
158 out_arg_max_ref = (h * in_cols + w) * depth + d;
159 }
160 }
161 }
162 }
163 }
164 }
165 }
166 }
167
168 if (input_backprop != nullptr) {
169 auto input_backprop_flat = input_backprop->flat<T>();
170 auto out_arg_max_flat = output_arg_max->flat<int64>();
171 auto out_backprop_flat = out_backprop.flat<T>();
172
173 // Initialize output to 0.
174 const int64 in_size = in_rows * in_cols * depth;
175 const int64 in_start = start * in_size;
176 const int64 in_end = limit * in_size;
177 EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1,
178 in_end - in_start);
179 in_shard.setConstant(T(0));
180
181 // Backpropagate.
182 const int out_size = out_height * out_width * depth;
183 const int out_start = start * out_size;
184 const int out_end = limit * out_size;
185 for (int index = out_start; index < out_end; ++index) {
186         const int64 input_backprop_index = out_arg_max_flat(index);
187         // Although this check is in the inner loop, it is worth keeping: it
188         // prevents memory corruption from an out-of-range argmax index, and
189         // our benchmarks show that the performance impact is quite small.
190         CHECK(FastBoundsCheck(input_backprop_index - in_start,
191                               in_end - in_start))
192             << "Invalid input backprop index: " << input_backprop_index;
193 input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
194 }
195 }
196 };
197
198 const int64 shard_cost = params.tensor_in_rows * params.tensor_in_cols *
199 params.depth * params.window_rows *
200 params.window_cols;
201 Shard(worker_threads.num_threads, worker_threads.workers,
202 params.tensor_in_batch, shard_cost, shard);
203 }
204
205 // The operation to compute MaxPool gradients.
206 // It takes three inputs:
207 // - The original input tensor
208 // - The original output tensor
209 // - Backprop tensor for output
210 // It produces one output: backprop tensor for input.
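// Concretely (illustrative description, not a behavior change): each incoming
// gradient element out_backprop[b, ph, pw, d] is added to the single input
// position that attained the corresponding maximum, i.e.
//   input_backprop[argmax(b, ph, pw, d)] += out_backprop[b, ph, pw, d],
// while every other input position receives zero.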
211 template <class Device, class T>
212 class MaxPoolingGradOp : public OpKernel {
213 public:
214   explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
215 string data_format;
216 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
217 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
218 errors::InvalidArgument("Invalid data format"));
219 OP_REQUIRES(
220 context, data_format_ == FORMAT_NHWC,
221 errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ",
222 "on device type ",
223 DeviceTypeString(context->device_type())));
224
225 if (context->num_inputs() == 3) {
226 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
227 OP_REQUIRES(context, ksize_.size() == 4,
228 errors::InvalidArgument("Sliding window ksize field must "
229 "specify 4 dimensions"));
230 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
231 OP_REQUIRES(context, stride_.size() == 4,
232 errors::InvalidArgument("Sliding window strides field must "
233 "specify 4 dimensions"));
234 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
235 errors::Unimplemented(
236 "Pooling is not yet supported on the batch dimension."));
237 OP_REQUIRES(
238 context, ksize_[3] == 1 && stride_[3] == 1,
239 errors::Unimplemented(
240 "MaxPoolingGrad is not yet supported on the depth dimension."));
241 }
242
243 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
244 }
245
246   void Compute(OpKernelContext* context) override {
247 const Tensor& tensor_in = context->input(0);
248 const Tensor& tensor_out = context->input(1);
249 const Tensor& out_backprop = context->input(2);
250
251 // For maxpooling, tensor_in should have 4 dimensions.
252 OP_REQUIRES(context, tensor_in.dims() == 4,
253 errors::InvalidArgument("tensor_in must be 4-dimensional"));
254 OP_REQUIRES(context, tensor_out.dims() == 4,
255 errors::InvalidArgument("tensor_out must be 4-dimensional"));
256 // For maxpooling, out_backprop should have 4 dimensions.
257 OP_REQUIRES(context, out_backprop.dims() == 4,
258 errors::InvalidArgument("out_backprop must be 4-dimensional"));
259
260 const TensorShape& output_shape = tensor_in.shape();
261
262 Tensor tensor_out_dup;
263 OP_REQUIRES_OK(context, context->forward_input_or_allocate_temp(
264 {1}, DataTypeToEnum<T>::v(), tensor_out.shape(),
265 &tensor_out_dup));
266 Tensor tensor_out_arg_max;
267 OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
268 tensor_out.shape(),
269 &tensor_out_arg_max));
270 std::vector<int32> ksize = ksize_;
271 std::vector<int32> stride = stride_;
272 if (context->num_inputs() == 5) {
273 const Tensor& tensor_ksize = context->input(3);
274 auto value_ksize = tensor_ksize.flat<int32>();
275 ksize.resize(tensor_ksize.shape().num_elements());
276 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
277
278 const Tensor& tensor_stride = context->input(4);
279 auto value_stride = tensor_stride.flat<int32>();
280 stride.resize(tensor_stride.shape().num_elements());
281 std::copy_n(&value_stride(0), stride.size(), stride.begin());
282 }
283
284 OP_REQUIRES(context, ksize.size() == 4,
285 errors::InvalidArgument("Sliding window ksize field must "
286 "specify 4 dimensions"));
287 OP_REQUIRES(context, stride.size() == 4,
288 errors::InvalidArgument("Sliding window strides field must "
289 "specify 4 dimensions"));
290 OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
291 errors::Unimplemented(
292 "Pooling is not yet supported on the batch dimension."));
293 OP_REQUIRES(
294 context, ksize[3] == 1 && stride[3] == 1,
295 errors::Unimplemented(
296 "MaxPoolingGrad is not yet supported on the depth dimension."));
297
298 PoolParameters params{context, ksize, stride,
299 padding_, FORMAT_NHWC, tensor_in.shape()};
300 if (!context->status().ok()) {
301 return;
302 }
303
304 Tensor* output = nullptr;
305 OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
306 {0}, 0, output_shape, &output));
307
308 SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
309 context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
310 out_backprop, params, true);
311 }
312
313 private:
314 std::vector<int32> ksize_;
315 std::vector<int32> stride_;
316 Padding padding_;
317 TensorFormat data_format_;
318 };
319
320 #ifdef GOOGLE_CUDA
321
322 template <typename T>
323 static void MaxPoolingBackwardCustomKernel(
324 OpKernelContext* context, const std::vector<int32>& size,
325 const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
326 const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
327 Tensor* output = nullptr;
328 OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
329 {0}, 0, tensor_in_shape, &output));
330
331 PoolParameters params{context, size, stride,
332 padding, FORMAT_NHWC, tensor_in_shape};
333 if (!context->status().ok()) {
334 return;
335 }
336
337 functor::MaxPoolBackwardNoMask<T>()(
338 tensor_in->flat<T>().data(), params.tensor_in_batch,
339 params.tensor_in_rows, params.tensor_in_cols, params.depth,
340 params.out_height, params.out_width, params.window_rows,
341 params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
342 params.pad_cols, out_backprop.flat<T>().data(), output->flat<T>().data(),
343 context->eigen_device<Eigen::GpuDevice>());
344 }
345
346 template <class T>
347 class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
348 public:
349 typedef Eigen::GpuDevice Device;
350
351   explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
352 string data_format;
353 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
354 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
355 errors::InvalidArgument("Invalid data format"));
356 if (context->num_inputs() == 3) {
357 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
358 OP_REQUIRES(context, ksize_.size() == 4,
359 errors::InvalidArgument("Sliding window ksize field must "
360 "specify 4 dimensions"));
361 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
362 OP_REQUIRES(context, stride_.size() == 4,
363 errors::InvalidArgument("Sliding window strides field must "
364 "specify 4 dimensions"));
365 const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
366 const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
367 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
368 errors::Unimplemented(
369 "Pooling is not yet supported on the batch dimension."));
370 }
371 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
372
373 use_dnn_ = CanUseCudnn();
374 TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
375 &propagate_nans_));
376 }
377
378   void Compute(OpKernelContext* context) override {
379 const Tensor& tensor_in = context->input(0);
380 const Tensor& tensor_out = context->input(1);
381 const Tensor& out_backprop = context->input(2);
382
383 // For maxpooling, tensor_in should have 4 dimensions.
384 OP_REQUIRES(context, tensor_in.dims() == 4,
385                 errors::InvalidArgument("tensor_in must be 4-dimensional"));
386 OP_REQUIRES(context, tensor_out.dims() == 4,
387 errors::InvalidArgument("tensor_out must be 4-dimensional"));
388 // For maxpooling, out_backprop should have 4 dimensions.
389 OP_REQUIRES(context, out_backprop.dims() == 4,
390 errors::InvalidArgument("out_backprop must be 4-dimensional"));
391
392 TensorShape output_shape = tensor_in.shape();
393
394 std::vector<int32> ksize = ksize_;
395 std::vector<int32> stride = stride_;
396 if (context->num_inputs() == 5) {
397 const Tensor& tensor_ksize = context->input(3);
398 auto value_ksize = tensor_ksize.flat<int32>();
399 ksize.resize(tensor_ksize.shape().num_elements());
400 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
401
402 const Tensor& tensor_stride = context->input(4);
403 auto value_stride = tensor_stride.flat<int32>();
404 stride.resize(tensor_stride.shape().num_elements());
405 std::copy_n(&value_stride(0), stride.size(), stride.begin());
406 }
407 OP_REQUIRES(context, ksize.size() == 4,
408 errors::InvalidArgument("Sliding window ksize field must "
409 "specify 4 dimensions"));
410 OP_REQUIRES(context, stride.size() == 4,
411 errors::InvalidArgument("Sliding window strides field must "
412 "specify 4 dimensions"));
413 const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
414 const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
415 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
416 errors::Unimplemented(
417 "Pooling is not yet supported on the batch dimension."));
418
419 if (use_dnn_) {
420 DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
421 ksize, stride, padding_, data_format_,
422 &tensor_in, &tensor_out, out_backprop,
423 output_shape, propagate_nans_);
424 } else {
425 CHECK(data_format_ == FORMAT_NHWC)
426 << "Non-Cudnn MaxPoolGrad only supports NHWC format";
427 MaxPoolingBackwardCustomKernel<T>(context, ksize, stride, padding_,
428 &tensor_in, out_backprop, output_shape);
429 }
430 }
431
432 private:
433 std::vector<int32> ksize_;
434 std::vector<int32> stride_;
435 Padding padding_;
436 TensorFormat data_format_;
437 bool use_dnn_;
438 bool propagate_nans_;
439 };
440
441 #endif // GOOGLE_CUDA
442
443 // The operation to compute gradient of MaxPool gradients.
444 // It takes three inputs:
445 // - The original input tensor
446 // - The original output tensor
447 // - Backprop tensor for output gradients
448 // It produces one output: backprop tensor for output gradient.
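// Concretely (illustrative description): for each pooled location the op
// forwards the incoming second-order gradient at the position of the maximum,
//   output[b, ph, pw, d] = out_grad_backprop[b, h_max, w_max, d],
// where (h_max, w_max) is the first position in the pooling window whose
// input value equals tensor_out[b, ph, pw, d].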
449 template <class Device, class T>
450 class MaxPoolingGradGradOp : public OpKernel {
451 public:
452   explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
453 : OpKernel(context) {
454 string data_format;
455 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
456 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
457 errors::InvalidArgument("Invalid data format"));
458 OP_REQUIRES(
459 context, data_format_ == FORMAT_NHWC,
460 errors::InvalidArgument(
461 "Default MaxPoolingGradGradOp only supports NHWC ",
462 "on device type ", DeviceTypeString(context->device_type())));
463
464 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
465
466 if (context->num_inputs() == 3) {
467 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
468 OP_REQUIRES(context, ksize_.size() == 4,
469 errors::InvalidArgument("Sliding window ksize field must "
470 "specify 4 dimensions"));
471 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
472 OP_REQUIRES(context, stride_.size() == 4,
473 errors::InvalidArgument("Sliding window strides field must "
474 "specify 4 dimensions"));
475 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
476 errors::Unimplemented(
477 "Pooling is not yet supported on the batch dimension."));
478 OP_REQUIRES(context, ksize_[3] == 1 && stride_[3] == 1,
479 errors::Unimplemented("MaxPoolingGradGrad is not yet "
480 "supported on the depth dimension."));
481 }
482 }
483
484   void Compute(OpKernelContext* context) override {
485 const Tensor& tensor_in = context->input(0);
486 const Tensor& tensor_out = context->input(1);
487 const Tensor& out_grad_backprop = context->input(2);
488
489 // For maxpooling, tensor_in should have 4 dimensions.
490 OP_REQUIRES(context, tensor_in.dims() == 4,
491 errors::InvalidArgument("tensor_in must be 4-dimensional"));
492 OP_REQUIRES(context, tensor_out.dims() == 4,
493 errors::InvalidArgument("tensor_out must be 4-dimensional"));
494 // For maxpooling, out_grad_backprop should have 4 dimensions.
495 OP_REQUIRES(
496 context, out_grad_backprop.dims() == 4,
497 errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
498
499 std::vector<int32> ksize = ksize_;
500 std::vector<int32> stride = stride_;
501 if (context->num_inputs() == 5) {
502 const Tensor& tensor_ksize = context->input(3);
503 auto value_ksize = tensor_ksize.flat<int32>();
504 ksize.resize(tensor_ksize.shape().num_elements());
505 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
506
507 const Tensor& tensor_stride = context->input(4);
508 auto value_stride = tensor_stride.flat<int32>();
509 stride.resize(tensor_stride.shape().num_elements());
510 std::copy_n(&value_stride(0), stride.size(), stride.begin());
511 }
512
513 OP_REQUIRES(context, ksize.size() == 4,
514 errors::InvalidArgument("Sliding window ksize field must "
515 "specify 4 dimensions"));
516 OP_REQUIRES(context, stride.size() == 4,
517 errors::InvalidArgument("Sliding window strides field must "
518 "specify 4 dimensions"));
519 OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
520 errors::Unimplemented(
521 "Pooling is not yet supported on the batch dimension."));
522 OP_REQUIRES(
523 context, ksize[3] == 1 && stride[3] == 1,
524 errors::Unimplemented(
525 "MaxPoolingGrad is not yet supported on the depth dimension."));
526
527 PoolParameters params{context, ksize, stride,
528 padding_, FORMAT_NHWC, tensor_in.shape()};
529 Tensor* output = nullptr;
530 OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
531 {2}, 0, tensor_out.shape(), &output));
532
533 SpatialMaxPoolGradGrad(context, output, tensor_in, tensor_out,
534 out_grad_backprop, params, padding_);
535 }
536
537 private:
538   void SpatialMaxPoolGradGrad(OpKernelContext* context, Tensor* bottom_diff,
539 const Tensor& tensor_in, const Tensor& tensor_out,
540 const Tensor& top_diff,
541 const PoolParameters& params,
542 const Padding& padding) {
543 typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
544 ConstEigenMatrixMap;
545 typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
546 EigenMatrixMap;
547
548 ConstEigenMatrixMap in_mat(
549 tensor_in.flat<T>().data(), params.depth,
550 params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
551 ConstEigenMatrixMap out_mat(
552 tensor_out.flat<T>().data(), params.depth,
553 params.out_width * params.out_height * params.tensor_in_batch);
554 ConstEigenMatrixMap top_diff_mat(
555 top_diff.flat<T>().data(), params.depth,
556 params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
557 EigenMatrixMap bottom_diff_mat(
558 bottom_diff->flat<T>().data(), params.depth,
559 params.out_width * params.out_height * params.tensor_in_batch);
560
561 const DeviceBase::CpuWorkerThreads& worker_threads =
562 *(context->device()->tensorflow_cpu_worker_threads());
563
564     // The code below does the following:
565 // 1. Flattens the input, output, top_diff and bottom_diff tensors into
566 // two dimensional arrays.
567 // tensor_in_as_matrix:
568 // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
569 // tensor_out_as_matrix:
570 // depth by (out_width * out_height * tensor_in_batch)
571 // top_diff_as_matrix:
572 // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
573 // bottom_diff_as_matrix:
574 // depth by (out_width * out_height * tensor_in_batch)
575 //
576 // 2. Walks through the set of columns in the flattened
577 // tensor_in_as_matrix, tensor_out_as_matrix, top_diff_as_matrix
578 // and updates the column(s) corresponding to the maximum values in
579 // tensor_out_as_matrix with the corresponding values in
580 // top_diff_as_matrix.
581     auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
582 int64 start, int64 limit) {
583 const int32 depth = params.depth;
584 const int32 in_rows = params.tensor_in_rows;
585 const int32 in_cols = params.tensor_in_cols;
586 const int32 pad_rows = params.pad_rows;
587 const int32 pad_cols = params.pad_cols;
588 const int32 window_rows = params.window_rows;
589 const int32 window_cols = params.window_cols;
590 const int32 row_stride = params.row_stride;
591 const int32 col_stride = params.col_stride;
592 const int32 out_height = params.out_height;
593 const int32 out_width = params.out_width;
594
595 {
596 // Initializes the output grad backprop tensor with 0.
597 const int32 output_image_size = out_height * out_width * params.depth;
598 EigenMatrixMap bottom_diff_shard(
599 bottom_diff_mat.data() + start * output_image_size, 1,
600 (limit - start) * output_image_size);
601 bottom_diff_shard.setZero();
602 }
603
604 for (int b = start; b < limit; ++b) {
605 for (int ph = 0; ph < out_height; ++ph) {
606 for (int pw = 0; pw < out_width; ++pw) {
607 // (h_start, h_end) * (w_start, w_end) is the range that the input
608 // vector projects to.
609 int h_start = ph * row_stride - pad_rows;
610 const int h_end = std::min(h_start + window_rows, in_rows);
611 int w_start = pw * col_stride - pad_cols;
612 const int w_end = std::min(w_start + window_cols, in_cols);
613 h_start = std::max(h_start, 0);
614 w_start = std::max(w_start, 0);
615 const int out_index = (b * out_height + ph) * out_width + pw;
616 // Find value corresponding to the input maximum in top_diff.
617 for (int d = 0; d < depth; ++d) {
618 const T& output_ref = out_mat.coeffRef(d, out_index);
619 bool should_stop = false;
620 for (int h = h_start; h < h_end && !should_stop; ++h) {
621 for (int w = w_start; w < w_end && !should_stop; ++w) {
622 const int in_index = (b * in_rows + h) * in_cols + w;
623 const T& input_ref = in_mat.coeffRef(d, in_index);
624 if (output_ref == input_ref) {
625 T& bottom_diff_ref = bottom_diff_mat.coeffRef(d, out_index);
626 bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
627 should_stop = true;
628 }
629 }
630 }
631 }
632 }
633 }
634 }
635 };
636
637 const int64 shard_cost = params.out_width * params.out_height *
638 params.depth * params.window_rows *
639 params.window_cols;
640 Shard(worker_threads.num_threads, worker_threads.workers,
641 params.tensor_in_batch, shard_cost, shard);
642 }
643
644 std::vector<int32> ksize_;
645 std::vector<int32> stride_;
646 Padding padding_;
647 TensorFormat data_format_;
648 };
649
650 #ifdef GOOGLE_CUDA
651
652 template <class T>
653 class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
654 public:
655 typedef Eigen::GpuDevice Device;
656
657   explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
658 : OpKernel(context) {
659 string data_format;
660 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
661 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
662 errors::InvalidArgument("Invalid data format"));
663 if (context->num_inputs() == 3) {
664 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
665 OP_REQUIRES(context, ksize_.size() == 4,
666 errors::InvalidArgument("Sliding window ksize field must "
667 "specify 4 dimensions"));
668 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
669 OP_REQUIRES(context, stride_.size() == 4,
670 errors::InvalidArgument("Sliding window strides field must "
671 "specify 4 dimensions"));
672 const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
673 const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
674 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
675 errors::Unimplemented(
676 "Pooling is not yet supported on the batch dimension."));
677 }
678 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
679 }
680
681   void Compute(OpKernelContext* context) override {
682 const Tensor& tensor_in = context->input(0);
683 const Tensor& tensor_out = context->input(1);
684 const Tensor& out_grad_backprop = context->input(2);
685
686 // For maxpooling, tensor_in should have 4 dimensions.
687 OP_REQUIRES(context, tensor_in.dims() == 4,
688                 errors::InvalidArgument("tensor_in must be 4-dimensional"));
689 OP_REQUIRES(context, tensor_out.dims() == 4,
690 errors::InvalidArgument("tensor_out must be 4-dimensional"));
691 // For maxpooling, out_grad_backprop should have 4 dimensions.
692 OP_REQUIRES(
693 context, out_grad_backprop.dims() == 4,
694 errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
695
696 Tensor* output = nullptr;
697 OP_REQUIRES_OK(context,
698 context->allocate_output(0, tensor_out.shape(), &output));
699
700 std::vector<int32> ksize = ksize_;
701 std::vector<int32> stride = stride_;
702 if (context->num_inputs() == 5) {
703 const Tensor& tensor_ksize = context->input(3);
704 auto value_ksize = tensor_ksize.flat<int32>();
705 ksize.resize(tensor_ksize.shape().num_elements());
706 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
707
708 const Tensor& tensor_stride = context->input(4);
709 auto value_stride = tensor_stride.flat<int32>();
710 stride.resize(tensor_stride.shape().num_elements());
711 std::copy_n(&value_stride(0), stride.size(), stride.begin());
712 }
713
714 OP_REQUIRES(context, ksize.size() == 4,
715 errors::InvalidArgument("Sliding window ksize field must "
716 "specify 4 dimensions"));
717 OP_REQUIRES(context, stride.size() == 4,
718 errors::InvalidArgument("Sliding window strides field must "
719 "specify 4 dimensions"));
720 const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
721 const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
722 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
723 errors::Unimplemented(
724 "Pooling is not yet supported on the batch dimension."));
725
726 PoolParameters params{context, ksize, stride,
727 padding_, data_format_, tensor_in.shape()};
728
729 functor::MaxPoolGradBackwardNoMask<T>()(
730 data_format_, tensor_in.flat<T>().data(), tensor_out.flat<T>().data(),
731 params.tensor_in_batch, params.out_height, params.out_width,
732 params.depth, params.tensor_in_rows, params.tensor_in_cols,
733 params.window_rows, params.window_cols, params.row_stride,
734 params.col_stride, params.pad_rows, params.pad_cols,
735 out_grad_backprop.flat<T>().data(), output->flat<T>().data(),
736 context->eigen_device<Eigen::GpuDevice>());
737 }
738
739 private:
740 std::vector<int32> ksize_;
741 std::vector<int32> stride_;
742 Padding padding_;
743 TensorFormat data_format_;
744 bool use_dnn_;
745 };
746
747 #endif // GOOGLE_CUDA
748
749 template <typename Device, typename T>
750 struct LaunchMaxPoolingNoMask;
751
752 template <typename Device, typename T>
753 class MaxPoolingNoMaskOp : public OpKernel {
754 public:
755   explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
756 : OpKernel(context) {
757 string data_format;
758 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
759 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
760 errors::InvalidArgument("Invalid data format"));
761 OP_REQUIRES(
762 context, data_format_ == FORMAT_NHWC,
763 errors::InvalidArgument(
764 "Default MaxPoolingNoMaskOp only supports NHWC on device type ",
765 DeviceTypeString(context->device_type())));
766 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
767 OP_REQUIRES(context, ksize_.size() == 4,
768 errors::InvalidArgument("Sliding window ksize field must "
769 "specify 4 dimensions"));
770 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
771 OP_REQUIRES(context, stride_.size() == 4,
772 errors::InvalidArgument("Sliding window stride field must "
773 "specify 4 dimensions"));
774 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
775 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
776 errors::Unimplemented(
777 "Pooling is not yet supported on the batch dimension."));
778 }
779
780   void Compute(OpKernelContext* context) override {
781 const Tensor& tensor_in = context->input(0);
782
783 PoolParameters params{context, ksize_, stride_,
784 padding_, data_format_, tensor_in.shape()};
785 if (!context->status().ok()) {
786 return;
787 }
788
789 TensorShape out_shape({params.tensor_in_batch, params.out_height,
790 params.out_width, params.depth});
791 Tensor* output = nullptr;
792 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
793
794 LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
795 output);
796 }
797
798 private:
799 std::vector<int32> ksize_;
800 std::vector<int32> stride_;
801 Padding padding_;
802 TensorFormat data_format_;
803 };
804
805 template <typename Device, typename T>
806 class MaxPoolingNoMaskV2Op : public OpKernel {
807 public:
808   explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
809 : OpKernel(context) {
810 string data_format;
811 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
812 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
813 errors::InvalidArgument("Invalid data format"));
814 OP_REQUIRES(
815 context, data_format_ == FORMAT_NHWC,
816 errors::InvalidArgument(
817 "Default MaxPoolingNoMaskOp only supports NHWC on device type ",
818 DeviceTypeString(context->device_type())));
819 if (context->num_inputs() == 1) {
820 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
821 OP_REQUIRES(context, ksize_.size() == 4,
822 errors::InvalidArgument("Sliding window ksize field must "
823 "specify 4 dimensions"));
824 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
825 OP_REQUIRES(context, stride_.size() == 4,
826 errors::InvalidArgument("Sliding window stride field must "
827 "specify 4 dimensions"));
828 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
829 errors::Unimplemented(
830 "Pooling is not yet supported on the batch dimension."));
831 }
832 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
833 }
834
835   void Compute(OpKernelContext* context) override {
836 const Tensor& tensor_in = context->input(0);
837
838 std::vector<int32> ksize = ksize_;
839 std::vector<int32> stride = stride_;
840
841 if (context->num_inputs() != 1) {
842 const Tensor& tensor_ksize = context->input(1);
843 auto value_ksize = tensor_ksize.flat<int32>();
844 ksize.resize(tensor_ksize.shape().num_elements());
845 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
846
847 const Tensor& tensor_stride = context->input(2);
848 auto value_stride = tensor_stride.flat<int32>();
849 stride.resize(tensor_stride.shape().num_elements());
850 std::copy_n(&value_stride(0), stride.size(), stride.begin());
851 }
852 OP_REQUIRES(context, ksize.size() == 4,
853 errors::InvalidArgument("Sliding window ksize field must "
854 "specify 4 dimensions"));
855 OP_REQUIRES(context, stride.size() == 4,
856 errors::InvalidArgument("Sliding window stride field must "
857 "specify 4 dimensions"));
858 OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
859 errors::Unimplemented(
860 "Pooling is not yet supported on the batch dimension."));
861 PoolParameters params{context, ksize, stride,
862 padding_, data_format_, tensor_in.shape()};
863 if (!context->status().ok()) {
864 return;
865 }
866
867 TensorShape out_shape({params.tensor_in_batch, params.out_height,
868 params.out_width, params.depth});
869 Tensor* output = nullptr;
870 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
871
872 LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
873 output);
874 }
875
876 private:
877 std::vector<int32> ksize_;
878 std::vector<int32> stride_;
879 Padding padding_;
880 TensorFormat data_format_;
881 };
882
883 template <typename Device, typename T>
884 struct LaunchMaxPoolingWithArgmax;
885
886 template <typename T>
887 struct LaunchMaxPoolingWithArgmax<CPUDevice, T> {
888   static void launch(OpKernelContext* context, const PoolParameters& params,
889 const Tensor& input, Tensor* output, Tensor* argmax,
890 bool propagate_nans, bool include_batch_in_index) {
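    // No input gradient is requested here (input_backprop is passed as
    // nullptr), and the helper only reads out_backprop when input_backprop is
    // non-null, so an empty placeholder tensor is safe to pass below.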
891 Tensor unused;
892 SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(context, output, argmax,
893 nullptr, input, unused, params,
894 include_batch_in_index);
895 }
896 };
897
898 template <typename Device, typename T>
899 class MaxPoolingWithArgmaxOp : public OpKernel {
900 public:
901   explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context)
902 : OpKernel(context) {
903 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
904 OP_REQUIRES(context, ksize_.size() == 4,
905 errors::InvalidArgument("Sliding window ksize field must "
906 "specify 4 dimensions"));
907 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
908 OP_REQUIRES(context, stride_.size() == 4,
909 errors::InvalidArgument("Sliding window stride field must "
910 "specify 4 dimensions"));
911 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
912 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
913 errors::Unimplemented(
914 "Pooling is not yet supported on the batch dimension."));
915 OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
916 &include_batch_in_index_));
917 TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
918 &propagate_nans_));
919 }
920
921   void Compute(OpKernelContext* context) override {
922 const Tensor& tensor_in = context->input(0);
923
924 PoolParameters params{context, ksize_, stride_,
925 padding_, FORMAT_NHWC, tensor_in.shape()};
926 if (!context->status().ok()) {
927 return;
928 }
929
930 TensorShape out_shape({params.tensor_in_batch, params.out_height,
931 params.out_width, params.depth});
932 Tensor* output = nullptr;
933 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
934 Tensor* argmax = nullptr;
935 OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));
936
937 LaunchMaxPoolingWithArgmax<Device, T>::launch(
938 context, params, tensor_in, output, argmax, propagate_nans_,
939 include_batch_in_index_);
940 }
941
942 private:
943 std::vector<int32> ksize_;
944 std::vector<int32> stride_;
945 Padding padding_;
946 bool propagate_nans_;
947 bool include_batch_in_index_;
948 };
949
950 template <typename Device, typename T>
951 struct LaunchMaxPoolingGradWithArgmax;
952
953 template <typename T>
954 struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> {
955 typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
956 EigenMatrixMap;
957
958   static void launch(OpKernelContext* context, const PoolParameters& params,
959 const Tensor& grad_in, const Tensor& argmax,
960 Tensor* grad_out, const bool include_batch_in_index) {
961 const DeviceBase::CpuWorkerThreads& worker_threads =
962 *(context->device()->tensorflow_cpu_worker_threads());
963
964 auto shard = [&grad_in, &argmax, &grad_out, include_batch_in_index](
965 int64 start, int64 limit) {
966 const int64 batch_size =
967 GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
968 const int64 output_size_per_batch = grad_out->NumElements() / batch_size;
969 const int64 input_size_per_batch = grad_in.NumElements() / batch_size;
970
971 {
972 auto grad_out_flat = grad_out->flat<T>();
973 auto argmax_flat = argmax.flat<int64>();
974 auto grad_in_flat = grad_in.flat<T>();
975
976 const int64 output_start = start * output_size_per_batch;
977 const int64 output_end = limit * output_size_per_batch;
978         EigenMatrixMap grad_out_shard(grad_out_flat.data() + output_start, 1,
979                                       output_end - output_start);
980         grad_out_shard.setConstant(T(0));
981
982 const int input_start = start * input_size_per_batch;
983 const int input_end = limit * input_size_per_batch;
984 for (int64 index = input_start; index < input_end; index++) {
985 int64 grad_out_index = argmax_flat(index);
986 if (!include_batch_in_index) {
987 const int64 cur_batch = index / input_size_per_batch;
988 grad_out_index += cur_batch * output_size_per_batch;
989 }
990 CHECK(grad_out_index >= output_start && grad_out_index < output_end)
991 << "Invalid output gradient index: " << grad_out_index << ", "
992 << output_start << ", " << output_end;
993 grad_out_flat(grad_out_index) += grad_in_flat(index);
994 }
995 }
996 };
997
998 const int64 batch_size = GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
999 const int64 shard_cost = grad_out->NumElements() / batch_size;
1000 Shard(worker_threads.num_threads, worker_threads.workers, batch_size,
1001 shard_cost, shard);
1002 }
1003 };
1004
1005 template <typename Device, typename T>
1006 class MaxPoolingGradWithArgmaxOp : public OpKernel {
1007 public:
1008   explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context)
1009 : OpKernel(context) {
1010 string data_format_str;
1011 auto status = context->GetAttr("data_format", &data_format_str);
1012 if (status.ok()) {
1013 OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
1014 errors::InvalidArgument("Invalid data format"));
1015 }
1016
1017 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
1018 OP_REQUIRES(context, ksize_.size() == 4,
1019 errors::InvalidArgument("Sliding window ksize field must "
1020 "specify 4 dimensions"));
1021 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
1022 OP_REQUIRES(context, stride_.size() == 4,
1023 errors::InvalidArgument("Sliding window stride field must "
1024 "specify 4 dimensions"));
1025 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1026 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
1027 errors::Unimplemented(
1028 "Pooling is not yet supported on the batch dimension."));
1029 OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
1030 &include_batch_in_index_));
1031 }
1032
1033   void Compute(OpKernelContext* context) override {
1034 const Tensor& tensor_in = context->input(0);
1035 const Tensor& grad_in = context->input(1);
1036 const Tensor& argmax = context->input(2);
1037
1038 PoolParameters params{context, ksize_, stride_,
1039 padding_, FORMAT_NHWC, tensor_in.shape()};
1040 if (!context->status().ok()) {
1041 return;
1042 }
1043
1044 TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows,
1045 params.tensor_in_cols, params.depth});
1046 Tensor* grad_out = nullptr;
1047 OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
1048 {0}, 0, out_shape, &grad_out));
1049
1050 LaunchMaxPoolingGradWithArgmax<Device, T>::launch(
1051 context, params, grad_in, argmax, grad_out, include_batch_in_index_);
1052 }
1053
1054 private:
1055 std::vector<int32> ksize_;
1056 std::vector<int32> stride_;
1057 Padding padding_;
1058 TensorFormat data_format_;
1059 bool include_batch_in_index_;
1060 };
1061
1062 template <typename Device, typename T>
1063 struct LaunchMaxPoolingGradGradWithArgmax;
1064
1065 template <typename Device, typename T>
1066 class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
1067 public:
1068   explicit MaxPoolingGradGradWithArgmaxOp(OpKernelConstruction* context)
1069 : OpKernel(context) {
1070 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
1071 OP_REQUIRES(context, ksize_.size() == 4,
1072 errors::InvalidArgument("Sliding window ksize field must "
1073 "specify 4 dimensions"));
1074 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
1075 OP_REQUIRES(context, stride_.size() == 4,
1076 errors::InvalidArgument("Sliding window stride field must "
1077 "specify 4 dimensions"));
1078 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1079 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
1080 errors::Unimplemented(
1081 "Pooling is not yet supported on the batch dimension."));
1082 OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
1083 &include_batch_in_index_));
1084 }
1085
1086   void Compute(OpKernelContext* context) override {
1087 const Tensor& tensor_in = context->input(0);
1088 const Tensor& grad_in = context->input(1);
1089 const Tensor& argmax = context->input(2);
1090
1091 PoolParameters params{context, ksize_, stride_,
1092 padding_, FORMAT_NHWC, tensor_in.shape()};
1093 if (!context->status().ok()) {
1094 return;
1095 }
1096
1097 TensorShape out_shape({params.tensor_in_batch, params.out_height,
1098 params.out_width, params.depth});
1099
1100 Tensor* grad_out = nullptr;
1101 OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
1102 {0}, 0, out_shape, &grad_out));
1103
1104 LaunchMaxPoolingGradGradWithArgmax<Device, T>::launch(
1105 context, params, grad_in, argmax, grad_out, include_batch_in_index_);
1106 }
1107
1108 private:
1109 std::vector<int32> ksize_;
1110 std::vector<int32> stride_;
1111 Padding padding_;
1112 bool include_batch_in_index_;
1113 };
1114
1115 #if GOOGLE_CUDA
1116 template <typename T>
1117 class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
1118 public:
1119 typedef GPUDevice Device;
1120   explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
1121 : OpKernel(context) {
1122 string data_format;
1123 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
1124 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
1125 errors::InvalidArgument("Invalid data format"));
1126 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
1127 OP_REQUIRES(context, ksize_.size() == 4,
1128 errors::InvalidArgument("Sliding window ksize field must "
1129 "specify 4 dimensions"));
1130 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
1131 OP_REQUIRES(context, stride_.size() == 4,
1132 errors::InvalidArgument("Sliding window stride field must "
1133 "specify 4 dimensions"));
1134 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1135 const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
1136 const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
1137 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
1138 errors::Unimplemented(
1139 "Pooling is not yet supported on the batch dimension."));
1140 use_dnn_ = CanUseCudnn();
1141
1142 TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
1143 &propagate_nans_));
1144 }
1145
1146   void Compute(OpKernelContext* context) override {
1147 const Tensor& tensor_in = context->input(0);
1148
1149 PoolParameters params{context, ksize_, stride_,
1150 padding_, data_format_, tensor_in.shape()};
1151 if (!context->status().ok()) {
1152 return;
1153 }
1154
1155 TensorShape out_shape =
1156 ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
1157 params.out_width, params.depth);
1158
1159 // Assuming qint8 <--> NCHW_VECT_C (int8x4) here.
1160 constexpr bool is_int8x4 = std::is_same<T, qint8>::value;
1161 OP_REQUIRES(context, (is_int8x4 == (data_format_ == FORMAT_NCHW_VECT_C)),
1162 errors::InvalidArgument(
1163 "qint8 should be used with data_format NCHW_VECT_C."));
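    // (NCHW_VECT_C stores channels in packed groups of four int8 values,
    // which is why it is only meaningful for qint8 tensors here.)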
1164
1165 #if CUDNN_VERSION >= 7300
1166 if (use_dnn_) {
1167 DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
1168 stride_, padding_, data_format_, tensor_in,
1169 out_shape, propagate_nans_);
1170 #else
1171 // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
1172 if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
1173 DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
1174 stride_, padding_, data_format_, tensor_in,
1175 out_shape, propagate_nans_);
1176 #endif
1177 } else {
1178 Tensor* output = nullptr;
1179 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
1180 if (is_int8x4) {
1181 LaunchMaxPoolingNoMask_NCHW_VECT_C<Device>::launch(context, params,
1182 tensor_in, output);
1183 } else if (data_format_ == FORMAT_NHWC) {
1184 LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
1185 output, propagate_nans_);
1186 } else {
1187 LOG(FATAL) << "MaxPool currently only supports the following (layout, "
1188 "type) combinations: (NHWC, non-qint8), "
1189 "(NCHW, non-qint8) or (NCHW_VECT_C, qint8). The "
1190 "requested combination ("
1191 << ToString(data_format_) << ", "
1192 << DataTypeString(DataTypeToEnum<T>::v())
1193 << ") is not supported.";
1194 }
1195 }
1196 }
1197
1198 private:
1199 std::vector<int32> ksize_;
1200 std::vector<int32> stride_;
1201 Padding padding_;
1202 TensorFormat data_format_;
1203 bool use_dnn_;
1204 bool propagate_nans_;
1205 };
1206
1207 template <typename T>
1208 class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
1209 public:
1210 typedef GPUDevice Device;
1211   explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
1212 : OpKernel(context) {
1213 string data_format;
1214 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
1215 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
1216 errors::InvalidArgument("Invalid data format"));
1217 if (context->num_inputs() == 1) {
1218 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
1219 OP_REQUIRES(context, ksize_.size() == 4,
1220 errors::InvalidArgument("Sliding window ksize field must "
1221 "specify 4 dimensions"));
1222 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
1223 OP_REQUIRES(context, stride_.size() == 4,
1224 errors::InvalidArgument("Sliding window stride field must "
1225 "specify 4 dimensions"));
1226 const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
1227 const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
1228 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
1229 errors::Unimplemented(
1230 "Pooling is not yet supported on the batch dimension."));
1231 }
1232 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1233 use_dnn_ = CanUseCudnn();
1234 TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
1235 &propagate_nans_));
1236 }
1237
1238   void Compute(OpKernelContext* context) override {
1239 const Tensor& tensor_in = context->input(0);
1240
1241 std::vector<int32> ksize = ksize_;
1242 std::vector<int32> stride = stride_;
1243
1244 if (context->num_inputs() != 1) {
1245 const Tensor& tensor_ksize = context->input(1);
1246 auto value_ksize = tensor_ksize.flat<int32>();
1247 ksize.resize(tensor_ksize.shape().num_elements());
1248 std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
1249
1250 const Tensor& tensor_stride = context->input(2);
1251 auto value_stride = tensor_stride.flat<int32>();
1252 stride.resize(tensor_stride.shape().num_elements());
1253 std::copy_n(&value_stride(0), stride.size(), stride.begin());
1254 }
1255 OP_REQUIRES(context, ksize.size() == 4,
1256 errors::InvalidArgument("Sliding window ksize field must "
1257 "specify 4 dimensions"));
1258 OP_REQUIRES(context, stride.size() == 4,
1259 errors::InvalidArgument("Sliding window stride field must "
1260 "specify 4 dimensions"));
1261 const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
1262 const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
1263 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
1264 errors::Unimplemented(
1265 "Pooling is not yet supported on the batch dimension."));
1266
1267 PoolParameters params{context, ksize, stride,
1268 padding_, data_format_, tensor_in.shape()};
1269 if (!context->status().ok()) {
1270 return;
1271 }
1272
1273 TensorShape out_shape =
1274 ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
1275 params.out_width, params.depth);
1276 if (use_dnn_ && data_format_ == FORMAT_NCHW) {
1277 DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize,
1278 stride, padding_, data_format_, tensor_in,
1279 out_shape, propagate_nans_);
1280 } else {
1281 CHECK(data_format_ == FORMAT_NHWC)
1282 << "Non-Cudnn MaxPool only supports NHWC format";
1283 Tensor* output = nullptr;
1284 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
1285 LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
1286 output, propagate_nans_);
1287 }
1288 }
1289
1290 private:
1291 std::vector<int32> ksize_;
1292 std::vector<int32> stride_;
1293 Padding padding_;
1294 TensorFormat data_format_;
1295 bool use_dnn_;
1296 bool propagate_nans_;
1297 };
1298
1299 template <typename T>
1300 struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
1301   static void launch(OpKernelContext* context, const PoolParameters& params,
1302 const Tensor& input, Tensor* output, bool propagate_nans) {
1303 bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
1304 input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
1305 params.tensor_in_cols, params.depth, params.out_height,
1306 params.out_width, params.window_rows, params.window_cols,
1307 params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
1308 output->flat<T>().data(), nullptr, context->eigen_gpu_device(),
1309 propagate_nans, false);
1310 if (!status) {
1311 context->SetStatus(
1312 errors::Internal("Failed launching MaxPoolForwardNoMask"));
1313 }
1314 }
1315 };
1316
1317 template <typename T>
1318 struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
1319   static void launch(OpKernelContext* context, const PoolParameters& params,
1320 const Tensor& input, Tensor* output, Tensor* argmax,
1321 bool propagate_nans, bool include_batch_in_index) {
1322 bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
1323 input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
1324 params.tensor_in_cols, params.depth, params.out_height,
1325 params.out_width, params.window_rows, params.window_cols,
1326 params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
1327 output->flat<T>().data(),
1328 reinterpret_cast<int64*>(argmax->flat<int64>().data()),
1329 context->eigen_gpu_device(), propagate_nans, include_batch_in_index);
1330 if (!status) {
1331 context->SetStatus(
1332 errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
1333 }
1334 }
1335 };
1336
1337 template <typename T>
1338 struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
1339   static void launch(OpKernelContext* context, const PoolParameters& params,
1340 const Tensor& grad_in, const Tensor& argmax,
1341 Tensor* grad_out, const bool include_batch_in_index) {
1342 const int input_size = params.tensor_in_batch * params.tensor_in_rows *
1343 params.tensor_in_cols * params.depth;
1344 const int output_size = params.tensor_in_batch * params.out_height *
1345 params.out_width * params.depth;
1346 const int top_offset = params.out_height * params.out_width * params.depth;
1347 const int bottom_offset =
1348 params.tensor_in_rows * params.tensor_in_cols * params.depth;
1349 bool status = functor::MaxPoolBackwardWithArgmax<T>()(
1350 output_size, input_size, grad_in.flat<T>().data(),
1351 reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
1352 bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
1353 include_batch_in_index);
1354 if (!status) {
1355 context->SetStatus(
1356 errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
1357 }
1358 }
1359 };
1360
1361 template <typename T>
1362 struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
1363   static void launch(OpKernelContext* context, const PoolParameters& params,
1364 const Tensor& grad_in, const Tensor& argmax,
1365 Tensor* grad_out, const bool include_batch_in_index) {
1366 const int input_size = params.tensor_in_batch * params.tensor_in_rows *
1367 params.tensor_in_cols * params.depth;
1368 const int output_size = params.tensor_in_batch * params.out_height *
1369 params.out_width * params.depth;
1370 const int top_offset =
1371 params.tensor_in_rows * params.tensor_in_cols * params.depth;
1372 const int bottom_offset =
1373 params.out_width * params.out_height * params.depth;
1374 bool status = functor::MaxPoolGradBackwardWithArgmax<T>()(
1375 output_size, input_size, grad_in.flat<T>().data(),
1376 reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
1377 bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
1378 include_batch_in_index);
1379 if (!status) {
1380 context->SetStatus(
1381 errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
1382 }
1383 }
1384 };
1385
1386 #endif // GOOGLE_CUDA
1387
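// REGISTER_MAX_POOL_KERNELS(D, T) registers the gradient and argmax max-pool
// kernels (MaxPoolGrad, MaxPoolGradGrad, their V2 variants, MaxPoolWithArgmax
// and MaxPoolGradWithArgmax) for type T on device D; it is instantiated for
// CPU below and, when built with CUDA, for GPU as well.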
1388 #define REGISTER_MAX_POOL_KERNELS(D, T) \
1389 REGISTER_KERNEL_BUILDER( \
1390 Name("MaxPoolGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
1391 MaxPoolingGradOp<D##Device, T>); \
1392 REGISTER_KERNEL_BUILDER( \
1393 Name("MaxPoolGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
1394 MaxPoolingGradGradOp<D##Device, T>); \
1395 REGISTER_KERNEL_BUILDER(Name("MaxPoolGradV2") \
1396 .Device(DEVICE_##D) \
1397 .HostMemory("ksize") \
1398 .HostMemory("strides") \
1399 .TypeConstraint<T>("T"), \
1400 MaxPoolingGradOp<D##Device, T>); \
1401 REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradV2") \
1402 .Device(DEVICE_##D) \
1403 .HostMemory("ksize") \
1404 .HostMemory("strides") \
1405 .TypeConstraint<T>("T"), \
1406 MaxPoolingGradGradOp<D##Device, T>) \
1407 REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") \
1408 .Device(DEVICE_##D) \
1409 .TypeConstraint<int64>("Targmax") \
1410 .TypeConstraint<T>("T"), \
1411 MaxPoolingWithArgmaxOp<D##Device, T>); \
1412 REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax") \
1413 .Device(DEVICE_##D) \
1414 .TypeConstraint<T>("T") \
1415 .TypeConstraint<int64>("Targmax"), \
1416 MaxPoolingGradWithArgmaxOp<D##Device, T>);
1417
1418 // Below kernels implemented only for CPU device.
1419 #define REGISTER_CPU_ONLY_POOL_KERNELS(T) \
1420 REGISTER_KERNEL_BUILDER( \
1421 Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
1422 MaxPoolingOp<CPUDevice, T>); \
1423 REGISTER_KERNEL_BUILDER( \
1424 Name("MaxPoolV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
1425 MaxPoolingV2Op<CPUDevice, T>);
1426 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
1427 #undef REGISTER_CPU_ONLY_POOL_KERNELS
1428
1429 #define REGISTER_CPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(CPU, T);
1430 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
1431 #undef REGISTER_CPU_MAX_POOL_KERNELS
1432
1433 #if GOOGLE_CUDA
1434
1435 // Forward declarations for the functor specializations for GPU.
1436 namespace functor {
1437 #define DECLARE_GPU_SPEC(T) \
1438 template <> \
1439 void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \
1440 const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
1441 typename TTypes<T, 4>::ConstTensor input, int window_rows, \
1442 int window_cols, int row_stride, int col_stride, \
1443 const Eigen::PaddingType& padding); \
1444 extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;
1445
1446 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
1447 #undef DECLARE_GPU_SPEC
1448 } // namespace functor
1449
1450 #define REGISTER_GPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(GPU, T)
1451 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
1452 #undef REGISTER_GPU_MAX_POOL_KERNELS
1453
1454 // Below kernels currently implemented only for GPU device.
1455 // Note(jiayq): Currently, the Caffe custom implementation is faster than the
1456 // default Eigen implementation so we are using the custom kernel as the
1457 // default. However, you can explicitly invoke the eigen version using
1458 // kernel_label_map.
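// For example (illustrative sketch; _kernel_label_map is a private Graph API
// used by the pooling tests, so treat this usage as an assumption):
//   with ops.get_default_graph()._kernel_label_map({"MaxPool": "eigen_tensor"}):
//     out = tf.nn.max_pool(...)
// selects the Eigen-based kernel registered below instead of the default.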
1459 #define REGISTER_GPU_ONLY_POOL_KERNELS(T) \
1460 REGISTER_KERNEL_BUILDER(Name("MaxPool") \
1461 .Device(DEVICE_GPU) \
1462 .TypeConstraint<T>("T") \
1463 .Label("eigen_tensor"), \
1464 MaxPoolingOp<GPUDevice, T>); \
1465 REGISTER_KERNEL_BUILDER(Name("MaxPoolV2") \
1466 .Device(DEVICE_GPU) \
1467 .HostMemory("ksize") \
1468 .HostMemory("strides") \
1469 .TypeConstraint<T>("T") \
1470 .Label("eigen_tensor"), \
1471 MaxPoolingV2Op<GPUDevice, T>); \
1472 REGISTER_KERNEL_BUILDER( \
1473 Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
1474 MaxPoolingNoMaskOp<GPUDevice, T>); \
1475 REGISTER_KERNEL_BUILDER(Name("MaxPoolV2") \
1476 .Device(DEVICE_GPU) \
1477 .HostMemory("ksize") \
1478 .HostMemory("strides") \
1479 .TypeConstraint<T>("T"), \
1480 MaxPoolingNoMaskV2Op<GPUDevice, T>); \
1481 REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax") \
1482 .Device(DEVICE_GPU) \
1483 .TypeConstraint<T>("T") \
1484 .TypeConstraint<int64>("Targmax"), \
1485 MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
1486 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS);
1487
1488 // TODO(b/65847473): Re-enable once the underlying build error is fixed.
1489 #if !defined(PLATFORM_WINDOWS)
1490 REGISTER_KERNEL_BUILDER(
1491 Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
1492 MaxPoolingNoMaskOp<GPUDevice, qint8>);
1493
1494 REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
1495 .Device(DEVICE_GPU)
1496 .HostMemory("ksize")
1497 .HostMemory("strides")
1498 .TypeConstraint<qint8>("T"),
1499 MaxPoolingV2Op<GPUDevice, qint8>);
1500
1501 REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
1502 .Device(DEVICE_GPU)
1503 .HostMemory("ksize")
1504 .HostMemory("strides")
1505 .TypeConstraint<qint8>("T")
1506 .Label("eigen_tensor"),
1507 MaxPoolingV2Op<GPUDevice, qint8>);
1508 #endif // !defined(PLATFORM_WINDOWS)
1509
1510 #undef REGISTER_GPU_ONLY_POOL_KERNELS
1511
1512 #endif // GOOGLE_CUDA
1513
1514 #undef REGISTER_MAX_POOL_KERNELS
1515
1516 } // namespace tensorflow
1517