/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/kernels/pooling_ops_common.h"

#include <vector>
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"

#if GOOGLE_CUDA
#include "cuda/include/cudnn.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

namespace {

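// RawType maps a TensorFlow element type to the plain storage type handed to
// StreamExecutor device memory; quantized qint8 data is passed to cuDNN as its
// underlying int8 representation.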
template <typename T>
struct RawType {
  using type = T;
};

template <>
struct RawType<qint8> {
  using type = int8;
};

}  // namespace

PoolParameters::PoolParameters(OpKernelContext* context,
                               const std::vector<int32>& ksize,
                               const std::vector<int32>& stride,
                               Padding padding, TensorFormat data_format,
                               const TensorShape& tensor_in_shape) {
  // For maxpooling, tensor_in should have 2 spatial dimensions.
  // Note: the total number of dimensions could be 4 for NHWC, NCHW,
  // or 5 for NCHW_VECT_C.
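  // For example (illustrative shapes): NHWC input is [batch, height, width,
  // channels]; NCHW is [batch, channels, height, width]; NCHW_VECT_C packs
  // channels as [batch, channels / 4, height, width, 4].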
  OP_REQUIRES(context,
              GetTensorSpatialDims(tensor_in_shape.dims(), data_format) == 2,
              errors::InvalidArgument(
                  "tensor_in_shape must have 2 spatial dimensions. ",
                  tensor_in_shape.dims(), " ", data_format));

  this->data_format = data_format;
  depth = GetTensorDim(tensor_in_shape, data_format, 'C') *
          (data_format == FORMAT_NCHW_VECT_C ? 4 : 1);
  tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W');
  tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H');
  tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
  window_rows = GetTensorDim(ksize, data_format, 'H');
  window_cols = GetTensorDim(ksize, data_format, 'W');
  depth_window = GetTensorDim(ksize, data_format, 'C');
  row_stride = GetTensorDim(stride, data_format, 'H');
  col_stride = GetTensorDim(stride, data_format, 'W');
  depth_stride = GetTensorDim(stride, data_format, 'C');

  // We only support 2D pooling across width/height and depthwise
  // pooling, not a combination.
  OP_REQUIRES(context,
              (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
              errors::Unimplemented(
                  "MaxPooling supports exactly one of pooling across depth "
                  "or pooling across width/height."));

  if (depth_window == 1) {
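    // For reference, GetWindowedOutputSize uses the standard TensorFlow
    // formulas: with VALID padding, out = ceil((in - window + 1) / stride) and
    // zero padding; with SAME padding, out = ceil(in / stride), with just
    // enough padding added to cover the window overhang.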
    OP_REQUIRES_OK(
        context, GetWindowedOutputSize(tensor_in_rows, window_rows, row_stride,
                                       padding, &out_height, &pad_rows));
    OP_REQUIRES_OK(
        context, GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride,
                                       padding, &out_width, &pad_cols));
    pad_depth = 0;
    out_depth = depth;
  } else {
    // Our current version of depthwise max pooling does not support
    // any padding, and expects the depth_window to equal the
    // depth_stride (no overlapping).
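    // For example (illustrative numbers): an input with depth 8 pooled with
    // depth_window = depth_stride = 2 produces out_depth = 8 / 2 = 4.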
    OP_REQUIRES(
        context, depth % depth_window == 0,
        errors::Unimplemented("Depthwise max pooling requires the depth "
                              "window to evenly divide the input depth"));
    OP_REQUIRES(
        context, depth_stride == depth_window,
        errors::Unimplemented("Depthwise max pooling requires the depth "
                              "window to equal the depth stride"));

    // The current version of depthwise max is only implemented on CPU.
    OP_REQUIRES(context,
                (DeviceType(static_cast<Device*>(context->device())
                                ->attributes()
                                .device_type()) == DeviceType(DEVICE_CPU)),
                errors::Unimplemented("Depthwise max pooling is currently "
                                      "only implemented for CPU devices."));

    pad_depth = 0;
    out_depth = depth / depth_window;
  }
}

TensorShape PoolParameters::forward_output_shape() {
  if (depth_window == 1) {
    // Spatial pooling
    return ShapeFromFormat(data_format, tensor_in_batch, out_height, out_width,
                           depth);
  } else {
    // Depthwise pooling
    return TensorShape(
        {tensor_in_batch, tensor_in_rows, tensor_in_cols, out_depth});
  }
}

#ifdef GOOGLE_CUDA

namespace {
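// Wraps a raw device pointer of `size` elements in a typed StreamExecutor
// DeviceMemory handle so it can be passed to the DNN (cuDNN) routines below.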
template <typename T>
se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
  se::DeviceMemory<T> typed(wrapped);
  return typed;
}
}  // namespace

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                         \
  template <>                                                       \
  void TransformDepth<GPUDevice, T, Eigen::DenseIndex>::operator()( \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in,    \
      const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle,           \
      typename TTypes<T, 4>::Tensor out);                           \
  extern template struct TransformDepth<GPUDevice, T, Eigen::DenseIndex>;

TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC)
#undef DECLARE_GPU_SPEC
}  // namespace functor

template <typename T>
void DnnPoolingOp<T>::Compute(OpKernelContext* context,
                              se::dnn::PoolingMode pooling_mode,
                              const std::vector<int32>& size,
                              const std::vector<int32>& stride, Padding padding,
                              TensorFormat data_format, const Tensor& tensor_in,
                              const TensorShape& tensor_out_shape,
                              bool propagate_nans) {
  Tensor* tensor_out = nullptr;
  OP_REQUIRES_OK(context,
                 context->allocate_output(0, tensor_out_shape, &tensor_out));
  if (tensor_in.shape().num_elements() == 0) {
    return;
  }

  PoolParameters params{context, size,        stride,
                        padding, data_format, tensor_in.shape()};
  if (!context->status().ok()) {
    return;
  }

  int batch_size = params.tensor_in_batch;
  int depth = params.depth;
#if CUDNN_VERSION < 7300
  /// cuDNN versions before 7.3 do not support the NHWC format for pooling, so
  /// the input must be converted to NCHW before calling cuDNN. Remove this
  /// workaround once such versions no longer need to be supported.
  Tensor transformed_input;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_in.shape(),
                                                data_format),
                                &transformed_input));
    functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                           tensor_in.tensor<T, 4>(),
                                           transformed_input.tensor<T, 4>());
  } else {
    transformed_input = tensor_in;
  }
  Tensor transformed_output;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_out_shape,
                                                data_format),
                                &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX;
#else
  auto& transformed_input = tensor_in;
  auto& transformed_output = *tensor_out;
  se::dnn::DataLayout data_layout;
  switch (data_format) {
    case FORMAT_NHWC:
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
      break;
    case FORMAT_NCHW:
      data_layout = se::dnn::DataLayout::kBatchDepthYX;
      break;
    case FORMAT_NCHW_VECT_C:
      // NCHW_VECT_C is not supported by cudnnPoolingForward(), but can be
      // emulated via NHWC.
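      // For example (illustrative numbers): a qint8 tensor with batch_size = 2
      // and depth = 32 is stored as [2, 8, H, W, 4]; treating each group of 4
      // channels as its own NHWC "image" gives batch_size = 2 * 8 = 16 and
      // depth = 4, which is what the adjustment below does.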
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
      batch_size *= depth / 4;
      depth = 4;
      break;
    default:
      OP_REQUIRES(context, false,
                  errors::InvalidArgument("Unsupported format: ",
                                          ToString(data_format)));
  }
#endif
  /// Get ready to call cudnn
  se::dnn::PoolingDescriptor pooling_desc;
  pooling_desc.set_pooling_mode(pooling_mode)
      .set_window_height(params.window_rows)
      .set_window_width(params.window_cols)
      .set_vertical_stride(params.row_stride)
      .set_horizontal_stride(params.col_stride)
      .set_vertical_padding(params.pad_rows)
      .set_horizontal_padding(params.pad_cols)
      .set_propagate_nans(propagate_nans);

  se::dnn::BatchDescriptor input_desc;
  input_desc.set_count(batch_size)
      .set_height(params.tensor_in_rows)
      .set_width(params.tensor_in_cols)
      .set_feature_map_count(depth)
      .set_layout(data_layout);

  se::dnn::BatchDescriptor output_desc;
  output_desc.set_count(batch_size)
      .set_height(params.out_height)
      .set_width(params.out_width)
      .set_feature_map_count(depth)
      .set_layout(data_layout);

  auto input_data =
      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
                         transformed_input.template flat<T>().data()),
                     transformed_input.template flat<T>().size());

  auto output_data =
      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
                         transformed_output.template flat<T>().data()),
                     transformed_output.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

  bool status = stream
                    ->ThenPoolForward(pooling_desc, input_desc, input_data,
                                      output_desc, &output_data)
                    .ok();
  OP_REQUIRES(context, status,
              errors::Internal("cudnn PoolForward launch failed"));
#if CUDNN_VERSION < 7300
  if (data_format == FORMAT_NHWC) {
    /// Transform the output data from NCHW back to NHWC
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    using RT = typename RawType<T>::type;
    functor::NCHWToNHWC<GPUDevice, RT, 4>()(
        context->eigen_device<Device>(),
        toConstTensor(transformed_output).template tensor<RT, 4>(),
        tensor_out->tensor<RT, 4>());
  }
#endif
}
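
// Illustrative (hypothetical) caller sketch: a GPU max-pooling kernel would
// typically forward its attributes straight to DnnPoolingOp<T>::Compute, e.g.
//
//   DnnPoolingOp<float>::Compute(context, se::dnn::PoolingMode::kMaximum,
//                                ksize_, stride_, padding_, data_format_,
//                                tensor_in, forward_output_shape,
//                                /*propagate_nans=*/false);
//
// where ksize_, stride_, padding_, and data_format_ are the kernel's
// attributes and forward_output_shape comes from PoolParameters.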

template <typename T>
void DnnPoolingGradOp<T>::Compute(
    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
    const std::vector<int32>& size, const std::vector<int32>& stride,
    Padding padding, TensorFormat data_format, const Tensor* tensor_in,
    const Tensor* tensor_out, const Tensor& out_backprop,
    const TensorShape& tensor_in_shape, bool propagate_nans) {
  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
        (tensor_in && tensor_out))
      << "For MaxPoolGrad, both tensor_in and tensor_out need to be "
         "specified";

  Tensor* input_backprop = nullptr;
  OP_REQUIRES_OK(context,
                 context->allocate_output(0, tensor_in_shape, &input_backprop));
  if (tensor_in_shape.num_elements() == 0) {
    return;
  }

  PoolParameters params{context, size,        stride,
                        padding, data_format, tensor_in_shape};
  if (!context->status().ok()) {
    return;
  }

  /// cudnn does not support the NHWC format for pooling gradients here, so
  /// convert to NCHW before calling cudnn. Remove this workaround once NHWC
  /// is supported.
  Tensor transformed_input;
  TensorShape transformed_input_shape;
  if (data_format == FORMAT_NHWC || !tensor_in) {
    transformed_input_shape =
        ShapeFromFormat(FORMAT_NCHW, tensor_in_shape, data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_input_shape,
                                                   &transformed_input));
  } else {
    transformed_input = *tensor_in;
  }
  Tensor transformed_output;
  TensorShape transformed_output_shape;
  if (data_format == FORMAT_NHWC || !tensor_out) {
    transformed_output_shape =
        ShapeFromFormat(FORMAT_NCHW, out_backprop.shape(), data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_output_shape,
                                                   &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  Tensor transformed_input_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_input_shape,
                                          &transformed_input_backprop));
  } else {
    transformed_input_backprop = *input_backprop;
  }
  Tensor transformed_output_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_output_shape,
                                          &transformed_output_backprop));
  } else {
    transformed_output_backprop = out_backprop;
  }

  if (data_format == FORMAT_NHWC) {
    /// Convert the data from NHWC to NCHW if necessary.
    if (tensor_in) {
      // For AvgPoolGrad, the original input tensor is not necessary. However,
      // cudnn still requires it to run, although it does not affect the
      // results.
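      // (The gradient of average pooling depends only on the pooling window
      // geometry, not on the original input or output values.)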
      functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                             tensor_in->tensor<T, 4>(),
                                             transformed_input.tensor<T, 4>());
    }
    if (tensor_out) {
      // For AvgPoolGrad, the original output tensor is not necessary. However,
      // cudnn still requires it to run, although it does not affect the
      // results.
      functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                             tensor_out->tensor<T, 4>(),
                                             transformed_output.tensor<T, 4>());
    }
    functor::NHWCToNCHW<GPUDevice, T, 4>()(
        context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
        transformed_output_backprop.tensor<T, 4>());
  }

  /// Get ready to call cudnn
  se::dnn::PoolingDescriptor pooling_desc;
  pooling_desc.set_pooling_mode(pooling_mode)
      .set_window_height(params.window_rows)
      .set_window_width(params.window_cols)
      .set_vertical_stride(params.row_stride)
      .set_horizontal_stride(params.col_stride)
      .set_vertical_padding(params.pad_rows)
      .set_horizontal_padding(params.pad_cols)
      .set_propagate_nans(propagate_nans);

  se::dnn::BatchDescriptor orig_output_desc;
  orig_output_desc.set_count(params.tensor_in_batch)
      .set_height(params.out_height)
      .set_width(params.out_width)
      .set_feature_map_count(params.depth)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);

  se::dnn::BatchDescriptor orig_input_desc;
  orig_input_desc.set_count(params.tensor_in_batch)
      .set_height(params.tensor_in_rows)
      .set_width(params.tensor_in_cols)
      .set_feature_map_count(params.depth)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);

  auto orig_output_data =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());
  auto orig_input_data =
      AsDeviceMemory(transformed_input.template flat<T>().data(),
                     transformed_input.template flat<T>().size());
  auto output_backprop_data =
      AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
                     transformed_output_backprop.template flat<T>().size());
  auto input_backprop_data =
      AsDeviceMemory(transformed_input_backprop.template flat<T>().data(),
                     transformed_input_backprop.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

  bool status =
      stream
          ->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data,
                             orig_output_desc, orig_output_data,
                             output_backprop_data, &input_backprop_data)
          .ok();
  OP_REQUIRES(context, status,
              errors::Internal("cudnn PoolBackward launch failed"));

  if (data_format == FORMAT_NHWC) {
    /// Transform the output data from NCHW back to NHWC.
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 4>()(
        context->eigen_device<Device>(),
        toConstTensor(transformed_input_backprop).template tensor<T, 4>(),
        input_backprop->tensor<T, 4>());
  }
}

#define DEFINE_DNN_OPS(T)         \
  template class DnnPoolingOp<T>; \
  template class DnnPoolingGradOp<T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS)

#if CUDNN_VERSION >= 7300
template class DnnPoolingOp<qint8>;
#endif

#undef DEFINE_DNN_OPS

#endif  // GOOGLE_CUDA

}  // namespace tensorflow