/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_
#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_

#include <vector>

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/avgpooling_op.h"
#include "tensorflow/core/kernels/maxpooling_op.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// A helper class to manage sizes and shapes for pooling operations.
struct PoolParameters {
  // Updates context->status if there is an invalid input.
  // explicit_paddings has eight elements if padding == EXPLICIT, and zero
  // elements otherwise.
  PoolParameters(OpKernelContext* context, const std::vector<int32>& ksize,
                 const std::vector<int32>& stride, Padding padding,
                 std::vector<int64> explicit_paddings, TensorFormat data_format,
                 const TensorShape& tensor_in_shape);

  // Returns the shape of the output for "forward" pooling operations.
  TensorShape forward_output_shape();

  int depth;

  int tensor_in_cols;
  int tensor_in_rows;
  int tensor_in_batch;

  int window_rows;
  int window_cols;
  int depth_window;

  int row_stride;
  int col_stride;
  int depth_stride;

  int64 out_height;
  int64 out_width;
  int out_depth;

  int64 pad_top;
  int64 pad_bottom;
  int64 pad_left;
  int64 pad_right;

  int pad_depth;

  TensorFormat data_format;
};

// Checks that the sizes of the paddings are less than the size of the window.
// This is required for MaxPool because it pads with -inf, so a pooling
// window must not fall entirely within the padded area.
Status CheckPaddingSize(PoolParameters& params);
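
// For example, with window_cols = 2 and pad_left = 2, the left-most pooling
// window along the width would cover only padded positions, so its output
// would be the -inf fill value rather than any real input element;
// CheckPaddingSize rejects such configurations.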

// An implementation of MaxPooling (forward).
// TODO(yongtang): Remove MaxPoolingOp and use MaxPoolingV2Op;
// QuantizedMaxPoolingOp depends on MaxPoolingOp, so keep it intact for now.
template <typename Device, typename T>
class MaxPoolingOp : public OpKernel {
 public:
  explicit MaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument("Default MaxPoolingOp only supports NHWC ",
                                  "on device type ",
                                  DeviceTypeString(context->device_type())));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    for (int i = 0; i < ksize_.size(); ++i) {
      OP_REQUIRES(context, ksize_[i] > 0,
                  errors::InvalidArgument(
                      "Sliding window ksize for dimension ", i,
                      " must be positive."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    if (padding_ == Padding::EXPLICIT) {
      OP_REQUIRES_OK(
          context, context->GetAttr("explicit_paddings", &explicit_paddings_));
    }
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    PoolParameters params{context,     ksize_,      stride_, padding_,
                          explicit_paddings_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation. A relaxation of
      // these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));
      OP_REQUIRES(
          context, padding_ != EXPLICIT,
          errors::Unimplemented("Depthwise max pooling does not support "
                                "explicit padding."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      // MaxPoolingOp is only called on the GPU when the eigen_tensor label
      // is used. In this case, explicit padding is not supported.
      if (std::is_same<Device, GPUDevice>::value &&
          padding_ == Padding::EXPLICIT) {
        context->SetStatus(errors::Unimplemented(
            "MaxPoolingOp does not support explicit padding."));
        return;
      }
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool which
  // does not handle all of the same options as SpatialMaxPool
  // (strict assumptions on no padding, stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in,
                        const PoolParameters& params) {
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      functor::SpatialMaxPooling<Device, T>()(
          context->eigen_device<Device>(), output->tensor<T, 4>(),
          tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
          params.row_stride, params.col_stride, pt);
    } else {
      typedef Eigen::Map<
          const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols *
                                     params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The following code basically does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix, and updates the corresponding column(s) in
      //    output_as_matrix with the max value.
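      //
      // The h_start / h_end (and w_start / w_end) bounds below invert that
      // mapping: output row ph reads input rows
      //   [ph * row_stride - pad_top, ph * row_stride - pad_top + window_rows),
      // so input row h (with hpad = h + pad_top) contributes to output rows
      //   ph in [(hpad - window_rows) / row_stride + 1, hpad / row_stride]
      // (integer division, lower bound clamped to 0, upper bound clamped to
      // out_height - 1); columns are handled the same way.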
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the
              // input vector projects to.
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows)
                      ? 0
                      : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end =
                  std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols)
                      ? 0
                      : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end =
                  std::min(wpad / col_stride + 1, out_width);
              // Compute the elementwise max.
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
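      // The cost passed to Shard() below approximates the work for one input
      // image (rows * cols * depth); Shard() uses this per-batch-element cost
      // estimate to decide how to split the batch across worker threads.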
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  std::vector<int64> explicit_paddings_;
  TensorFormat data_format_;
};

template <typename Device>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output) {
#if GOOGLE_CUDA
    bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()(
        reinterpret_cast<const int32*>(input.flat<qint8>().data()),
        params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols,
        params.depth, params.out_height, params.out_width, params.window_rows,
        params.window_cols, params.row_stride, params.col_stride,
        params.pad_top, params.pad_left,
        reinterpret_cast<int32*>(output->flat<qint8>().data()),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(errors::Internal(
          "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
    }
#else
    // ROCm TODO: add support for __vmaxs4 on ROCm.
    context->SetStatus(errors::Internal(
        "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
#endif  // GOOGLE_CUDA
  }
};
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
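
// Like MaxPoolingOp above, but ksize and strides may also be provided at
// runtime as the op's second and third inputs (see Compute below); on GPU it
// additionally dispatches qint8 inputs in NCHW_VECT_C layout to
// LaunchMaxPoolingNoMask_NCHW_VECT_C.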
template <typename Device, typename T>
class MaxPoolingV2Op : public OpKernel {
 public:
  explicit MaxPoolingV2Op(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context,
          data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW_VECT_C,
          errors::InvalidArgument(
              "MaxPoolingV2Op only supports NHWC or NCHW_VECT_C. Got: ",
              data_format));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    PoolParameters params{
        context,
        ksize,
        stride,
        padding_,
        /*explicit_paddings=*/{},
        data_format_,
        tensor_in.shape(),
    };
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation. A relaxation of
      // these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool which
  // does not handle all of the same options as SpatialMaxPool
  // (strict assumptions on no padding, stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
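  //
  // For example, with an NHWC input of depth 6 and depth_window ==
  // depth_stride == 3, each output channel c at a given (batch, row, col)
  // position is the max over input channels [3 * c, 3 * c + 3) at that
  // position: channels are innermost in memory, so each column of in_by_pool
  // holds one group of depth_window consecutive channel values.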
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in,
                        const PoolParameters& params) {
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      if (std::is_same<T, qint8>::value) {
        LaunchMaxPoolingNoMask_NCHW_VECT_C<GPUDevice>::launch(
            context, params, tensor_in, output);
      } else {
        functor::SpatialMaxPooling<Device, T>()(
            context->eigen_device<Device>(), output->tensor<T, 4>(),
            tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
            params.row_stride, params.col_stride, pt);
      }
    } else
#endif
    {
      typedef Eigen::Map<
          const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols *
                                     params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The following code basically does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix, and updates the corresponding column(s) in
      //    output_as_matrix with the max value.
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the
              // input vector projects to.
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows)
                      ? 0
                      : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end =
                  std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols)
                      ? 0
                      : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end =
                  std::min(wpad / col_stride + 1, out_width);
              // Compute the elementwise max.
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename Device, typename T>
void SpatialAvgPool(OpKernelContext* context, Tensor* output,
                    const Tensor& input, const PoolParameters& params,
                    const Padding& padding) {
  typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      ConstEigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      EigenMatrixMap;

  auto in_flat = input.flat<T>();
  auto out_flat = output->flat<T>();

  auto shard = [&params, &in_flat, &out_flat](int64_t start, int64_t limit) {
    // Calculate indices for this shard's chunk of work.
    const int64_t input_image_size =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    const int64_t output_image_size =
        params.out_width * params.out_height * params.depth;
    const int64_t shard_batch_size = limit - start;

    ConstEigenMatrixMap in_mat(
        in_flat.data() + start * input_image_size, params.depth,
        params.tensor_in_cols * params.tensor_in_rows * shard_batch_size);
    EigenMatrixMap out_mat(
        out_flat.data() + start * output_image_size, params.depth,
        params.out_width * params.out_height * shard_batch_size);
    Eigen::Matrix<T, Eigen::Dynamic, 1> out_count(out_mat.cols());
    out_count.setZero();

    // Initializes output to zero.
    out_mat.setZero();

    // The following code basically does the following:
    // 1. Flattens the input and output tensors into two dimensional arrays.
    //    tensor_in_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    output_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //
    // 2. Walks through the set of columns in the flattened
    //    tensor_in_as_matrix, and updates the corresponding column(s) in
    //    output_as_matrix with the average value.
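    //
    // Note that out_count tracks how many real (unpadded) input cells fall
    // into each output cell, so the final division averages only over the
    // window positions that actually overlapped the input; padded positions
    // contribute to neither the sum nor the count.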
    for (int b = 0; b < shard_batch_size; ++b) {
      for (int h = 0; h < params.tensor_in_rows; ++h) {
        for (int w = 0; w < params.tensor_in_cols; ++w) {
          // (h_start, h_end) * (w_start, w_end) is the range that the input
          // vector projects to.
          const int hpad = h + params.pad_top;
          const int wpad = w + params.pad_left;
          const int h_start =
              (hpad < params.window_rows)
                  ? 0
                  : (hpad - params.window_rows) / params.row_stride + 1;
          const int h_end =
              std::min<int>(hpad / params.row_stride + 1, params.out_height);
          const int w_start =
              (wpad < params.window_cols)
                  ? 0
                  : (wpad - params.window_cols) / params.col_stride + 1;
          const int w_end =
              std::min<int>(wpad / params.col_stride + 1, params.out_width);
          const int in_offset =
              (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
          Eigen::DSizes<Eigen::DenseIndex, 2> in_indices(0, in_offset);
          for (int ph = h_start; ph < h_end; ++ph) {
            for (int pw = w_start; pw < w_end; ++pw) {
              const int out_offset =
                  (b * params.out_height + ph) * params.out_width + pw;
              out_mat.col(out_offset) += in_mat.col(in_offset);
              out_count(out_offset) += T(1);
            }
          }
        }
      }
    }

    DCHECK_GT(out_count.minCoeff(), T(0));
    out_mat.array().rowwise() /= out_count.transpose().array();
  };

  const int64_t work_unit_size =
      params.tensor_in_rows * params.tensor_in_cols * params.depth;
  // NOTE: Constants in the calculation below were estimated based on
  // benchmarking. Nanoseconds/work_unit for benchmarks ranged from 0.01 to
  // 0.001, so the factor 0.01 (i.e. 1/100), with a lower bound of 10000, was
  // chosen to keep the work unit cost in an operating range in which it
  // empirically performed best.
  const int64_t work_unit_cost = std::max(int64{10000}, work_unit_size / 100);
  const DeviceBase::CpuWorkerThreads& worker_threads =
      *(context->device()->tensorflow_cpu_worker_threads());
  Shard(worker_threads.num_threads, worker_threads.workers,
        params.tensor_in_batch, work_unit_cost, shard);
}

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_