/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS

#include <array>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/conv_3d.h"
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#include "tensorflow/core/kernels/cudnn_pooling_gpu.h"

typedef Eigen::GpuDevice GPUDevice;

namespace tensorflow {

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

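// Runs 3D pooling forward through the StreamExecutor DNN backend
// (cuDNN on CUDA builds, MIOpen on ROCm builds).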
template <typename T>
void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
                                se::dnn::PoolingMode pooling_mode,
                                const std::array<int64, 3>& window,
                                const std::array<int64, 3>& stride,
                                const std::array<int64, 3>& padding,
                                TensorFormat data_format,
                                const Tensor& tensor_in, Tensor* output) {
  const auto in_shape = tensor_in.shape();
  const auto out_shape = output->shape();

  const int64_t in_batch = GetTensorDim(tensor_in, data_format, 'N');
  const int64_t in_features = GetTensorDim(tensor_in, data_format, 'C');

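  // The DNN pooling call below works on NCHW-style (here NCDHW) data, so
  // NHWC inputs are transposed into temporaries first and the result is
  // transposed back at the end.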
  Tensor transformed_input;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_in.shape(),
                                                data_format),
                                &transformed_input));
    functor::NHWCToNCHW<GPUDevice, T, 5>()(context->eigen_device<GPUDevice>(),
                                           tensor_in.tensor<T, 5>(),
                                           transformed_input.tensor<T, 5>());
  } else {
    transformed_input = tensor_in;
  }
  Tensor transformed_output;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(
                       DataTypeToEnum<T>::value,
                       ShapeFromFormat(FORMAT_NCHW, out_shape, data_format),
                       &transformed_output));
  } else {
    transformed_output = *output;
  }

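  // Describe the pooling window and the input/output tensors for the DNN
  // library. The '2' - i index walks TensorFlow's three spatial dimensions
  // in reverse order when mapping them onto the DNN DimIndex axes.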
  se::dnn::PoolingDescriptor pooling_desc(3);
  pooling_desc.set_pooling_mode(pooling_mode);
  se::dnn::BatchDescriptor input_desc(3);
  input_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
  se::dnn::BatchDescriptor output_desc(3);
  output_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
  for (size_t i = 0; i < window.size(); ++i) {
    const auto dim_i = static_cast<se::dnn::DimIndex>(i);
    pooling_desc.set_window(dim_i, window[i]);
    pooling_desc.set_stride(dim_i, stride[i]);
    pooling_desc.set_padding(dim_i, padding[i]);
    input_desc.set_spatial_dim(dim_i,
                               GetTensorDim(tensor_in, data_format, '2' - i));
    output_desc.set_spatial_dim(dim_i,
                                GetTensorDim(out_shape, data_format, '2' - i));
  }

  auto input_data = AsDeviceMemory(transformed_input.template flat<T>().data(),
                                   transformed_input.template flat<T>().size());
  auto output_data =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

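  // On ROCm, a scratch allocator is supplied for MIOpen's pooling workspace;
  // the cuDNN path below runs without one.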
#if TENSORFLOW_USE_ROCM
  static int64 PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );

  DnnScratchAllocator scratch_allocator(PoolingScratchSize, context);
  bool status =
      stream
          ->ThenPoolForward(pooling_desc, input_desc, input_data, output_desc,
                            &output_data, &scratch_allocator)
          .ok();
#else
  bool status = stream
                    ->ThenPoolForward(pooling_desc, input_desc, input_data,
                                      output_desc, &output_data)
                    .ok();
#endif

  OP_REQUIRES(context, status,
              errors::Internal("dnn PoolForward launch failed"));

  if (data_format == FORMAT_NHWC) {
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 5>()(
        context->eigen_device<GPUDevice>(),
        toConstTensor(transformed_output).template tensor<T, 5>(),
        output->tensor<T, 5>());
  }
}

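// Computes the gradient of 3D pooling with respect to its input via the
// StreamExecutor DNN backend. For max pooling the original input and output
// tensors must be provided; otherwise they may be null.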
template <typename T>
void DnnPooling3dGradOp<T>::Compute(
    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
    const std::array<int64, 3>& window, const std::array<int64, 3>& stride,
    const std::array<int64, 3>& padding,
    const std::array<int64, 3>& output_size, TensorFormat data_format,
    const Tensor& out_backprop, const TensorShape& tensor_in_shape,
    const Tensor* tensor_in, const Tensor* tensor_out, Tensor* input_backprop) {
  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
        (tensor_in && tensor_out))
      << "For MaxPoolGrad, both tensor_in and tensor_out need to be "
         "specified";

  const int64_t in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
  const int64_t in_features = GetTensorDim(tensor_in_shape, data_format, 'C');

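  // Set up NCDHW-layout tensors for the forward input/output, the incoming
  // gradient, and the computed input gradient. Temporaries are allocated
  // when the data is in NHWC layout or, for the forward tensors, when they
  // were not supplied by the caller.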
  Tensor transformed_input;
  TensorShape transformed_input_shape;
  if (data_format == FORMAT_NHWC || tensor_in == nullptr) {
    transformed_input_shape =
        ShapeFromFormat(FORMAT_NCHW, tensor_in_shape, data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_input_shape,
                                                   &transformed_input));
  } else {
    transformed_input = *tensor_in;
  }
  Tensor transformed_output;
  TensorShape transformed_output_shape;
  if (data_format == FORMAT_NHWC || tensor_out == nullptr) {
    transformed_output_shape =
        ShapeFromFormat(FORMAT_NCHW, out_backprop.shape(), data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_output_shape,
                                                   &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  Tensor transformed_input_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_input_shape,
                                          &transformed_input_backprop));
  } else {
    transformed_input_backprop = *input_backprop;
  }
  Tensor transformed_output_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_output_shape,
                                          &transformed_output_backprop));
  } else {
    transformed_output_backprop = out_backprop;
  }
  if (data_format == FORMAT_NHWC) {
    if (tensor_in != nullptr) {
      functor::NHWCToNCHW<GPUDevice, T, 5>()(context->eigen_device<GPUDevice>(),
                                             tensor_in->tensor<T, 5>(),
                                             transformed_input.tensor<T, 5>());
    }
    if (tensor_out != nullptr) {
      functor::NHWCToNCHW<GPUDevice, T, 5>()(context->eigen_device<GPUDevice>(),
                                             tensor_out->tensor<T, 5>(),
                                             transformed_output.tensor<T, 5>());
    }
    functor::NHWCToNCHW<GPUDevice, T, 5>()(
        context->eigen_device<GPUDevice>(), out_backprop.tensor<T, 5>(),
        transformed_output_backprop.tensor<T, 5>());
  }

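  // Describe the pooling configuration and the original (forward) input and
  // output tensors. Note that the original output's spatial dimensions are
  // taken from the output_size argument rather than from out_backprop.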
  se::dnn::PoolingDescriptor pooling_desc(3);
  pooling_desc.set_pooling_mode(pooling_mode);

  se::dnn::BatchDescriptor orig_output_desc(3);
  orig_output_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);

  se::dnn::BatchDescriptor orig_input_desc(3);
  orig_input_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);

  for (size_t i = 0; i < window.size(); ++i) {
    const auto dim_i = static_cast<se::dnn::DimIndex>(i);
    pooling_desc.set_window(dim_i, window[i]);
    pooling_desc.set_stride(dim_i, stride[i]);
    pooling_desc.set_padding(dim_i, padding[i]);
    orig_input_desc.set_spatial_dim(
        dim_i, GetTensorDim(tensor_in_shape, data_format, '2' - i));
    orig_output_desc.set_spatial_dim(dim_i, output_size[i]);
  }

  auto orig_output_data =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());
  auto orig_input_data =
      AsDeviceMemory(transformed_input.template flat<T>().data(),
                     transformed_input.template flat<T>().size());
  auto output_backprop_data =
      AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
                     transformed_output_backprop.template flat<T>().size());
  auto input_backprop_data =
      AsDeviceMemory(transformed_input_backprop.template flat<T>().data(),
                     transformed_input_backprop.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

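  // As in the forward pass, the ROCm build supplies a scratch allocator for
  // the pooling workspace; the cuDNN path runs without one.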
#if TENSORFLOW_USE_ROCM
  static int64 PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );

  DnnScratchAllocator scratch_allocator(PoolingScratchSize, context);
  bool status = stream
                    ->ThenPoolBackward(pooling_desc, orig_input_desc,
                                       orig_input_data, orig_output_desc,
                                       orig_output_data, output_backprop_data,
                                       &input_backprop_data, &scratch_allocator)
                    .ok();
#else
  bool status =
      stream
          ->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data,
                             orig_output_desc, orig_output_data,
                             output_backprop_data, &input_backprop_data)
          .ok();
#endif

  OP_REQUIRES(context, status,
              errors::Internal("dnn PoolBackward launch failed"));

  if (data_format == FORMAT_NHWC) {
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 5>()(
        context->eigen_device<GPUDevice>(),
        toConstTensor(transformed_input_backprop).template tensor<T, 5>(),
        input_backprop->tensor<T, 5>());
  }
}

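// Explicit instantiations for the supported element types (float and half).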
#define DEFINE_DNN_OPS(T)           \
  template class DnnPooling3dOp<T>; \
  template class DnnPooling3dGradOp<T>;
TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS)
#undef DEFINE_DNN_OPS

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow