/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/kernels/pooling_ops_common.h"

#include <vector>

#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"

#if GOOGLE_CUDA
#include "cuda/include/cudnn.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

namespace {

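// RawType maps the quantized qint8 element type to its underlying int8
// representation so buffers can be handed to StreamExecutor as plain device
// memory; every other type maps to itself.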
template <typename T>
struct RawType {
  using type = T;
};

template <>
struct RawType<qint8> {
  using type = int8;
};

}  // namespace

PoolParameters::PoolParameters(OpKernelContext* context,
                               const std::vector<int32>& ksize,
                               const std::vector<int32>& stride,
                               Padding padding, TensorFormat data_format,
                               const TensorShape& tensor_in_shape) {
  // For maxpooling, tensor_in should have 2 spatial dimensions.
  // Note: the total number of dimensions could be 4 for NHWC, NCHW,
  // or 5 for NCHW_VECT_C.
  OP_REQUIRES(context,
              GetTensorSpatialDims(tensor_in_shape.dims(), data_format) == 2,
              errors::InvalidArgument(
                  "tensor_in_shape must have 2 spatial dimensions. ",
                  tensor_in_shape.dims(), " ", data_format));

  this->data_format = data_format;
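  // For NCHW_VECT_C the 'C' dimension counts vectors of 4 packed channels,
  // so the logical depth is 4x the stored channel dimension.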
  depth = GetTensorDim(tensor_in_shape, data_format, 'C') *
          (data_format == FORMAT_NCHW_VECT_C ? 4 : 1);
  tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W');
  tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H');
  tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
  window_rows = GetTensorDim(ksize, data_format, 'H');
  window_cols = GetTensorDim(ksize, data_format, 'W');
  depth_window = GetTensorDim(ksize, data_format, 'C');
  row_stride = GetTensorDim(stride, data_format, 'H');
  col_stride = GetTensorDim(stride, data_format, 'W');
  depth_stride = GetTensorDim(stride, data_format, 'C');

  // We only support 2D pooling across width/height and depthwise
  // pooling, not a combination.
  OP_REQUIRES(context,
              (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
              errors::Unimplemented(
                  "MaxPooling supports exactly one of pooling across depth "
                  "or pooling across width/height."));

  if (depth_window == 1) {
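    // For reference, GetWindowedOutputSize computes
    //   VALID: out = ceil((in - window + 1) / stride), pad = 0
    //   SAME:  out = ceil(in / stride), with the padding split as evenly as
    //          possible between the two sides.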
    OP_REQUIRES_OK(
        context, GetWindowedOutputSize(tensor_in_rows, window_rows, row_stride,
                                       padding, &out_height, &pad_rows));
    OP_REQUIRES_OK(
        context, GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride,
                                       padding, &out_width, &pad_cols));
    pad_depth = 0;
    out_depth = depth;
  } else {
    // Our current version of depthwise max pooling does not support
    // any padding, and expects the depth_window to equal the
    // depth_stride (no overlapping).
    OP_REQUIRES(
        context, depth % depth_window == 0,
        errors::Unimplemented("Depthwise max pooling requires the depth "
                              "window to evenly divide the input depth"));
    OP_REQUIRES(
        context, depth_stride == depth_window,
        errors::Unimplemented("Depthwise max pooling requires the depth "
                              "window to equal the depth stride"));

    // The current version of depthwise max is only implemented on CPU.
    OP_REQUIRES(context,
                (DeviceType(static_cast<Device*>(context->device())
                                ->attributes()
                                .device_type()) == DeviceType(DEVICE_CPU)),
                errors::Unimplemented("Depthwise max pooling is currently "
                                      "only implemented for CPU devices."));

    pad_depth = 0;
    out_depth = depth / depth_window;
  }
}

TensorShape PoolParameters::forward_output_shape() {
  if (depth_window == 1) {
    // Spatial pooling
    return ShapeFromFormat(data_format, tensor_in_batch, out_height, out_width,
                           depth);
  } else {
    // Depthwise pooling
    return TensorShape(
        {tensor_in_batch, tensor_in_rows, tensor_in_cols, out_depth});
  }
}

#if GOOGLE_CUDA

namespace {
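// Wraps a raw device pointer in a typed se::DeviceMemory<T> view; `size` is
// measured in elements, not bytes. The wrapper does not own the memory.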
template <typename T>
se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
  se::DeviceMemory<T> typed(wrapped);
  return typed;
}
}  // namespace

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                         \
  template <>                                                       \
  void TransformDepth<GPUDevice, T, Eigen::DenseIndex>::operator()( \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in,    \
      const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle,           \
      typename TTypes<T, 4>::Tensor out);                           \
  extern template struct TransformDepth<GPUDevice, T, Eigen::DenseIndex>;

TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC)
#undef DECLARE_GPU_SPEC
}  // namespace functor

template <typename T>
void DnnPoolingOp<T>::Compute(OpKernelContext* context,
                              se::dnn::PoolingMode pooling_mode,
                              const std::vector<int32>& size,
                              const std::vector<int32>& stride, Padding padding,
                              TensorFormat data_format, const Tensor& tensor_in,
                              const TensorShape& tensor_out_shape,
                              bool propagate_nans) {
  Tensor* tensor_out = nullptr;
  OP_REQUIRES_OK(context,
                 context->allocate_output(0, tensor_out_shape, &tensor_out));
  if (tensor_in.shape().num_elements() == 0) {
    return;
  }

  PoolParameters params{context, size, stride,
                        padding, data_format, tensor_in.shape()};
  if (!context->status().ok()) {
    return;
  }

  int batch_size = params.tensor_in_batch;
  int depth = params.depth;
#if CUDNN_VERSION < 7300
  /// cuDNN versions before 7.3 do not support the NHWC format, so the data is
  /// converted to NCHW before calling cuDNN. This conversion can be removed
  /// once support for older cuDNN versions is dropped.
  Tensor transformed_input;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_in.shape(),
                                                data_format),
                                &transformed_input));
    functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                           tensor_in.tensor<T, 4>(),
                                           transformed_input.tensor<T, 4>());
  } else {
    transformed_input = tensor_in;
  }
  Tensor transformed_output;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_out_shape,
                                                data_format),
                                &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX;
#else
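  // cuDNN 7.3+ can pool NHWC data directly, so no layout transformation is
  // needed; just pick the StreamExecutor layout matching data_format.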
  auto& transformed_input = tensor_in;
  auto& transformed_output = *tensor_out;
  se::dnn::DataLayout data_layout;
  switch (data_format) {
    case FORMAT_NHWC:
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
      break;
    case FORMAT_NCHW:
      data_layout = se::dnn::DataLayout::kBatchDepthYX;
      break;
    case FORMAT_NCHW_VECT_C:
      // NCHW_VECT_C is not supported by cudnnPoolingForward(), but can be
      // emulated via NHWC.
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
      batch_size *= depth / 4;
      depth = 4;
      break;
    default:
      OP_REQUIRES(context, false,
                  errors::InvalidArgument("Unsupported format: ",
                                          ToString(data_format)));
  }
#endif
  /// Get ready to call cudnn
  se::dnn::PoolingDescriptor pooling_desc;
  pooling_desc.set_pooling_mode(pooling_mode)
      .set_window_height(params.window_rows)
      .set_window_width(params.window_cols)
      .set_vertical_stride(params.row_stride)
      .set_horizontal_stride(params.col_stride)
      .set_vertical_padding(params.pad_rows)
      .set_horizontal_padding(params.pad_cols)
      .set_propagate_nans(propagate_nans);

  se::dnn::BatchDescriptor input_desc;
  input_desc.set_count(batch_size)
      .set_height(params.tensor_in_rows)
      .set_width(params.tensor_in_cols)
      .set_feature_map_count(depth)
      .set_layout(data_layout);

  se::dnn::BatchDescriptor output_desc;
  output_desc.set_count(batch_size)
      .set_height(params.out_height)
      .set_width(params.out_width)
      .set_feature_map_count(depth)
      .set_layout(data_layout);

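  // Wrap the input/output buffers for StreamExecutor. For qint8 the data is
  // reinterpreted as raw int8 (see RawType above).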
  auto input_data =
      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
                         transformed_input.template flat<T>().data()),
                     transformed_input.template flat<T>().size());

  auto output_data =
      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
                         transformed_output.template flat<T>().data()),
                     transformed_output.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

  bool status = stream
                    ->ThenPoolForward(pooling_desc, input_desc, input_data,
                                      output_desc, &output_data)
                    .ok();
  OP_REQUIRES(context, status,
              errors::Internal("cudnn PoolForward launch failed"));
#if CUDNN_VERSION < 7300
  if (data_format == FORMAT_NHWC) {
    /// Transform the output data from NCHW back to NHWC
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    using RT = typename RawType<T>::type;
    functor::NCHWToNHWC<GPUDevice, RT, 4>()(
        context->eigen_device<Device>(),
        toConstTensor(transformed_output).template tensor<RT, 4>(),
        tensor_out->tensor<RT, 4>());
  }
#endif
}

template <typename T>
void DnnPoolingGradOp<T>::Compute(
    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
    const std::vector<int32>& size, const std::vector<int32>& stride,
    Padding padding, TensorFormat data_format, const Tensor* tensor_in,
    const Tensor* tensor_out, const Tensor& out_backprop,
    const TensorShape& tensor_in_shape, bool propagate_nans) {
  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
        (tensor_in && tensor_out))
      << "For MaxPoolGrad, both tensor_in and tensor_out need to be "
         "specified";

  Tensor* input_backprop = nullptr;
  OP_REQUIRES_OK(context,
                 context->allocate_output(0, tensor_in_shape, &input_backprop));
  if (tensor_in_shape.num_elements() == 0) {
    return;
  }

  PoolParameters params{context, size, stride,
                        padding, data_format, tensor_in_shape};
  if (!context->status().ok()) {
    return;
  }

  /// For now, cuDNN does not support the NHWC format for this call, so NHWC
  /// data is converted to NCHW before calling cuDNN. This conversion can be
  /// removed once cuDNN supports NHWC directly.
  Tensor transformed_input;
  TensorShape transformed_input_shape;
  if (data_format == FORMAT_NHWC || !tensor_in) {
    transformed_input_shape =
        ShapeFromFormat(FORMAT_NCHW, tensor_in_shape, data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_input_shape,
                                                   &transformed_input));
  } else {
    transformed_input = *tensor_in;
  }
  Tensor transformed_output;
  TensorShape transformed_output_shape;
  if (data_format == FORMAT_NHWC || !tensor_out) {
    transformed_output_shape =
        ShapeFromFormat(FORMAT_NCHW, out_backprop.shape(), data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_output_shape,
                                                   &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
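  // In NHWC mode the gradient is computed into an NCHW scratch buffer and
  // transposed back into input_backprop at the end.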
  Tensor transformed_input_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_input_shape,
                                          &transformed_input_backprop));
  } else {
    transformed_input_backprop = *input_backprop;
  }
  Tensor transformed_output_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_output_shape,
                                          &transformed_output_backprop));
  } else {
    transformed_output_backprop = out_backprop;
  }

  if (data_format == FORMAT_NHWC) {
    /// Convert the data from NHWC to NCHW if necessary.
    if (tensor_in) {
      // For AvgPoolGrad, the original input tensor is not needed. However,
      // cudnn still requires it to be passed in, even though it does not
      // affect the results.
      functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                             tensor_in->tensor<T, 4>(),
                                             transformed_input.tensor<T, 4>());
    }
    if (tensor_out) {
      // For AvgPoolGrad, the original output tensor is not needed. However,
      // cudnn still requires it to be passed in, even though it does not
      // affect the results.
      functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                             tensor_out->tensor<T, 4>(),
                                             transformed_output.tensor<T, 4>());
    }
    functor::NHWCToNCHW<GPUDevice, T, 4>()(
        context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
        transformed_output_backprop.tensor<T, 4>());
  }

  /// Get ready to call cudnn
  se::dnn::PoolingDescriptor pooling_desc;
  pooling_desc.set_pooling_mode(pooling_mode)
      .set_window_height(params.window_rows)
      .set_window_width(params.window_cols)
      .set_vertical_stride(params.row_stride)
      .set_horizontal_stride(params.col_stride)
      .set_vertical_padding(params.pad_rows)
      .set_horizontal_padding(params.pad_cols)
      .set_propagate_nans(propagate_nans);

  se::dnn::BatchDescriptor orig_output_desc;
  orig_output_desc.set_count(params.tensor_in_batch)
      .set_height(params.out_height)
      .set_width(params.out_width)
      .set_feature_map_count(params.depth)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);

  se::dnn::BatchDescriptor orig_input_desc;
  orig_input_desc.set_count(params.tensor_in_batch)
      .set_height(params.tensor_in_rows)
      .set_width(params.tensor_in_cols)
      .set_feature_map_count(params.depth)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);

  auto orig_output_data =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());
  auto orig_input_data =
      AsDeviceMemory(transformed_input.template flat<T>().data(),
                     transformed_input.template flat<T>().size());
  auto output_backprop_data =
      AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
                     transformed_output_backprop.template flat<T>().size());
  auto input_backprop_data =
      AsDeviceMemory(transformed_input_backprop.template flat<T>().data(),
                     transformed_input_backprop.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

  bool status =
      stream
          ->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data,
                             orig_output_desc, orig_output_data,
                             output_backprop_data, &input_backprop_data)
          .ok();
  OP_REQUIRES(context, status,
              errors::Internal("cudnn PoolBackward launch failed"));

  if (data_format == FORMAT_NHWC) {
    /// Transform the output data from NCHW back to NHWC.
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 4>()(
        context->eigen_device<Device>(),
        toConstTensor(transformed_input_backprop).template tensor<T, 4>(),
        input_backprop->tensor<T, 4>());
  }
}

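// Explicit instantiations for each floating-point type covered by
// TF_CALL_GPU_NUMBER_TYPES.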
#define DEFINE_DNN_OPS(T)             \
  template class DnnPoolingOp<T>;     \
  template class DnnPoolingGradOp<T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS)

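// Forward pooling for qint8 (used with the NCHW_VECT_C layout) requires
// cuDNN 7.3 or newer.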
#if CUDNN_VERSION >= 7300
template class DnnPoolingOp<qint8>;
#endif

#undef DEFINE_DNN_OPS

#endif  // GOOGLE_CUDA

}  // namespace tensorflow