/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_
#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_

#include <vector>

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/avgpooling_op.h"
#include "tensorflow/core/kernels/maxpooling_op.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// A helper class to manage sizes and shapes for pooling operations.
struct PoolParameters {
  // Updates context->status if there is an invalid input.
  // explicit_paddings has eight elements if padding==EXPLICIT, and zero
  // elements otherwise.
  PoolParameters(OpKernelContext* context, const std::vector<int32>& ksize,
                 const std::vector<int32>& stride, Padding padding,
                 std::vector<int64> explicit_paddings, TensorFormat data_format,
                 const TensorShape& tensor_in_shape);

  // Returns the shape of the output for "forward" pooling operations.
  TensorShape forward_output_shape();

  int depth;

  int tensor_in_cols;
  int tensor_in_rows;
  int tensor_in_batch;

  int window_rows;
  int window_cols;
  int depth_window;

  int row_stride;
  int col_stride;
  int depth_stride;

  int64 out_height;
  int64 out_width;
  int out_depth;

  int64 pad_top;
  int64 pad_bottom;
  int64 pad_left;
  int64 pad_right;

  int pad_depth;

  TensorFormat data_format;
};
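
// Editor's note (illustrative, not part of the original header): for the
// spatial dimensions, the sizes computed by the PoolParameters constructor and
// returned by forward_output_shape() follow the usual TensorFlow windowed
// output-size conventions:
//   VALID:    out = ceil((in - window + 1) / stride)
//   SAME:     out = ceil(in / stride)
//   EXPLICIT: out = ceil((in + pad_before + pad_after - window + 1) / stride)
// For example (hypothetical values), in = 7, window = 3, stride = 2 gives
// out = 3 under VALID padding and out = 4 under SAME padding.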

// Checks that the padding sizes are less than the window size in each
// dimension. This is required for MaxPool because it pads with -inf, so the
// pooling window cannot fully cover the padded area.
Status CheckPaddingSize(PoolParameters& params);

// An implementation of MaxPooling (forward).
// TODO (yongtang): Remove MaxPoolingOp and use MaxPoolingV2Op,
//     QuantizedMaxPoolingOp depends on MaxPoolingOp so keep intact for now
template <typename Device, typename T>
class MaxPoolingOp : public OpKernel {
 public:
  explicit MaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument("Default MaxPoolingOp only supports NHWC ",
                                  "on device type ",
                                  DeviceTypeString(context->device_type())));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    for (int i = 0; i < ksize_.size(); ++i) {
      OP_REQUIRES(context, ksize_[i] > 0,
                  errors::InvalidArgument("Sliding window ksize for dimension ",
                                          i, " was zero."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    if (padding_ == Padding::EXPLICIT) {
      OP_REQUIRES_OK(
          context, context->GetAttr("explicit_paddings", &explicit_paddings_));
    }
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    PoolParameters params{
        context,     ksize_,           stride_, padding_, explicit_paddings_,
        FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation.  A
      // relaxation of these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));
      OP_REQUIRES(
          context, padding_ != EXPLICIT,
          errors::Unimplemented("Depthwise max pooling does not support "
                                "explicit padding."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      // MaxPoolingOp is only called on the GPU when the eigen_tensor label
      // is used. In this case, explicit padding is not supported.
      if (std::is_same<Device, GPUDevice>::value &&
          padding_ == Padding::EXPLICIT) {
        context->SetStatus(errors::Unimplemented(
            "MaxPoolingOp does not support explicit padding."));
        return;
      }
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool which
  // does not handle all of the same options as SpatialMaxPool
  // (strict assumptions on no padding, stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in, const PoolParameters& params) {
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }
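
  // Editor's note (illustrative, not in the original header): because the data
  // is NHWC and Compute() has already checked depth_window == depth_stride and
  // depth % depth_window == 0, each group of depth_window consecutive channel
  // values forms one pooling group. Viewing the whole input as a column-major
  // (depth_window x N) matrix and taking the column-wise max therefore
  // computes every depthwise output value at once. For example (hypothetical
  // values), with depth = 4 and depth_window = 2, an NHWC pixel [1, 5, 2, 3]
  // maps to columns {1, 5} and {2, 3}, producing outputs 5 and 3.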

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    // On GPU, use Eigen's Spatial Max Pooling.  On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      functor::SpatialMaxPooling<Device, T>()(
          context->eigen_device<Device>(), output->tensor<T, 4>(),
          tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
          params.row_stride, params.col_stride, pt);
    } else {
      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols * params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The following code basically does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix, and updates the corresponding column(s) in
      //    output_as_matrix with the max value.
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the input
              // vector projects to.
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows) ? 0
                                       : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end = std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols) ? 0
                                       : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end = std::min(wpad / col_stride + 1, out_width);
              // compute elementwise max
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  std::vector<int64> explicit_paddings_;
  TensorFormat data_format_;
};
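
// Editor's note (illustrative, not part of the original header): the CPU shard
// above inverts the pooling window instead of iterating over output windows.
// For a padded input row index hpad, the output rows whose windows cover it
// are [h_start, h_end) with
//   h_start = (hpad < window_rows) ? 0 : (hpad - window_rows) / row_stride + 1
//   h_end   = min(hpad / row_stride + 1, out_height)
// For example (hypothetical values), with window_rows = 3 and row_stride = 2,
// hpad = 5 gives h_start = 2 and h_end = 3, i.e. only output row 2 (whose
// window covers padded rows [4, 7)) sees this input row. The column range is
// derived the same way.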

template <typename Device>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C;

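// Editor's note (added, not in the original header): NCHW_VECT_C stores the
// channel dimension in groups of four 8-bit values, which is presumably why
// the qint8 buffers below are reinterpreted as int32: the kernel can then
// compare four packed channels per 32-bit word (see the __vmaxs4 reference in
// the ROCm TODO below).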
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output) {
#if GOOGLE_CUDA
    bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()(
        reinterpret_cast<const int32*>(input.flat<qint8>().data()),
        params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols,
        params.depth, params.out_height, params.out_width, params.window_rows,
        params.window_cols, params.row_stride, params.col_stride,
        params.pad_top, params.pad_left,
        reinterpret_cast<int32*>(output->flat<qint8>().data()),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(errors::Internal(
          "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
    }
#else
    // ROCm TODO: add support for __vmaxs4 on ROCm
    context->SetStatus(errors::Internal(
        "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
#endif  // GOOGLE_CUDA
  }
};
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Device, typename T>
class MaxPoolingV2Op : public OpKernel {
 public:
  explicit MaxPoolingV2Op(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context,
          data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW_VECT_C,
          errors::InvalidArgument(
              "MaxPoolingV2Op only supports NHWC or NCHW_VECT_C. Got: ",
              data_format));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    PoolParameters params{
        context,
        ksize,
        stride,
        padding_,
        /*explicit_paddings=*/{},
        data_format_,
        tensor_in.shape(),
    };
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation.  A
      // relaxation of these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool which
  // does not handle all of the same options as SpatialMaxPool
  // (strict assumptions on no padding, stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in, const PoolParameters& params) {
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    // On GPU, use Eigen's Spatial Max Pooling.  On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      if (std::is_same<T, qint8>::value) {
        LaunchMaxPoolingNoMask_NCHW_VECT_C<GPUDevice>::launch(
            context, params, tensor_in, output);
      } else {
        functor::SpatialMaxPooling<Device, T>()(
            context->eigen_device<Device>(), output->tensor<T, 4>(),
            tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
            params.row_stride, params.col_stride, pt);
      }
    } else
#endif
    {
      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols * params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The following code basically does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix, and updates the corresponding column(s) in
      //    output_as_matrix with the max value.
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the input
              // vector projects to.
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows) ? 0
                                       : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end = std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols) ? 0
                                       : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end = std::min(wpad / col_stride + 1, out_width);
              // compute elementwise max
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename Device, typename T>
void SpatialAvgPool(OpKernelContext* context, Tensor* output,
                    const Tensor& input, const PoolParameters& params,
                    const Padding& padding) {
  typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      ConstEigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      EigenMatrixMap;

  auto in_flat = input.flat<T>();
  auto out_flat = output->flat<T>();

  auto shard = [&params, &in_flat, &out_flat](int64_t start, int64_t limit) {
    // Calculate indices for this shard's chunk of work.
    const int64_t input_image_size =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    const int64_t output_image_size =
        params.out_width * params.out_height * params.depth;
    const int64_t shard_batch_size = limit - start;

    ConstEigenMatrixMap in_mat(
        in_flat.data() + start * input_image_size, params.depth,
        params.tensor_in_cols * params.tensor_in_rows * shard_batch_size);
    EigenMatrixMap out_mat(
        out_flat.data() + start * output_image_size, params.depth,
        params.out_width * params.out_height * shard_batch_size);
    Eigen::Matrix<T, Eigen::Dynamic, 1> out_count(out_mat.cols());
    out_count.setZero();

    // Initializes output to zero.
    out_mat.setZero();

    // The following code basically does the following:
    // 1. Flattens the input and output tensors into two dimensional arrays.
    //    tensor_in_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    output_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //
    // 2. Walks through the set of columns in the flattened
    //    tensor_in_as_matrix, and updates the corresponding column(s) in
    //    output_as_matrix with the average value.
    for (int b = 0; b < shard_batch_size; ++b) {
      for (int h = 0; h < params.tensor_in_rows; ++h) {
        for (int w = 0; w < params.tensor_in_cols; ++w) {
          // (h_start, h_end) * (w_start, w_end) is the range that the input
          // vector projects to.
          const int hpad = h + params.pad_top;
          const int wpad = w + params.pad_left;
          const int h_start =
              (hpad < params.window_rows)
                  ? 0
                  : (hpad - params.window_rows) / params.row_stride + 1;
          const int h_end =
              std::min<int>(hpad / params.row_stride + 1, params.out_height);
          const int w_start =
              (wpad < params.window_cols)
                  ? 0
                  : (wpad - params.window_cols) / params.col_stride + 1;
          const int w_end =
              std::min<int>(wpad / params.col_stride + 1, params.out_width);
          const int in_offset =
              (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
          Eigen::DSizes<Eigen::DenseIndex, 2> in_indices(0, in_offset);
          for (int ph = h_start; ph < h_end; ++ph) {
            for (int pw = w_start; pw < w_end; ++pw) {
              const int out_offset =
                  (b * params.out_height + ph) * params.out_width + pw;
              out_mat.col(out_offset) += in_mat.col(in_offset);
              out_count(out_offset) += T(1);
            }
          }
        }
      }
    }

    DCHECK_GT(out_count.minCoeff(), T(0));
    out_mat.array().rowwise() /= out_count.transpose().array();
  };

  const int64_t work_unit_size =
      params.tensor_in_rows * params.tensor_in_cols * params.depth;
  // NOTE: Constants in the calculation below were estimated based on
  // benchmarking. Nanoseconds/work_unit for benchmarks ranged from 0.01 to
  // 0.001, so the factor 0.01 (i.e. 1/100), with a lower bound of 10000, was
  // chosen to limit the work unit cost to an operating range in which it
  // empirically performed best.
  const int64_t work_unit_cost = std::max(int64{10000}, work_unit_size / 100);
  const DeviceBase::CpuWorkerThreads& worker_threads =
      *(context->device()->tensorflow_cpu_worker_threads());
  Shard(worker_threads.num_threads, worker_threads.workers,
        params.tensor_in_batch, work_unit_cost, shard);
}
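
// Editor's note (illustrative, not part of the original header): Shard()
// splits the range [0, tensor_in_batch) into contiguous [start, limit) chunks
// and runs the lambda on the CPU thread pool, using work_unit_cost to decide
// how many batch elements to group into each task. For example (hypothetical
// values), a 64x64x32 image gives work_unit_size = 131072, so
// work_unit_cost = max(10000, 131072 / 100) = max(10000, 1310) = 10000.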

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_