/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS

#include <array>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/conv_3d.h"
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#include "tensorflow/core/kernels/cudnn_pooling_gpu.h"

typedef Eigen::GpuDevice GPUDevice;

namespace tensorflow {

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

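// Forward pass of 3D pooling, dispatched through StreamExecutor's DNN support
// (cuDNN on CUDA builds, MIOpen on ROCm builds).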
template <typename T>
void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
                                se::dnn::PoolingMode pooling_mode,
                                const std::array<int64, 3>& window,
                                const std::array<int64, 3>& stride,
                                const std::array<int64, 3>& padding,
                                TensorFormat data_format,
                                const Tensor& tensor_in, Tensor* output) {
  const auto in_shape = tensor_in.shape();
  const auto out_shape = output->shape();

  const int64_t in_batch = GetTensorDim(tensor_in, data_format, 'N');
  const int64_t in_features = GetTensorDim(tensor_in, data_format, 'C');

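  // This kernel describes its buffers to the DNN library in kBatchDepthYX
  // (NCDHW) layout, so NDHWC (FORMAT_NHWC) inputs are first transposed into a
  // temporary tensor; the output is handled the same way below.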
  Tensor transformed_input;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_in.shape(),
                                                data_format),
                                &transformed_input));
    functor::NHWCToNCHW<GPUDevice, T, 5>()(context->eigen_device<GPUDevice>(),
                                           tensor_in.tensor<T, 5>(),
                                           transformed_input.tensor<T, 5>());
  } else {
    transformed_input = tensor_in;
  }
  Tensor transformed_output;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(
                       DataTypeToEnum<T>::value,
                       ShapeFromFormat(FORMAT_NCHW, out_shape, data_format),
                       &transformed_output));
  } else {
    transformed_output = *output;
  }

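  // Describe the pooling window and the input/output shapes for the DNN
  // library. se::dnn indexes spatial dimensions innermost-first, which is why
  // tensor dimensions are looked up with '2' - i.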
  se::dnn::PoolingDescriptor pooling_desc(3);
  pooling_desc.set_pooling_mode(pooling_mode);
  se::dnn::BatchDescriptor input_desc(3);
  input_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
  se::dnn::BatchDescriptor output_desc(3);
  output_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
  for (size_t i = 0; i < window.size(); ++i) {
    const auto dim_i = static_cast<se::dnn::DimIndex>(i);
    pooling_desc.set_window(dim_i, window[i]);
    pooling_desc.set_stride(dim_i, stride[i]);
    pooling_desc.set_padding(dim_i, padding[i]);
    input_desc.set_spatial_dim(dim_i,
                               GetTensorDim(tensor_in, data_format, '2' - i));
    output_desc.set_spatial_dim(dim_i,
                                GetTensorDim(out_shape, data_format, '2' - i));
  }

  auto input_data = AsDeviceMemory(transformed_input.template flat<T>().data(),
                                   transformed_input.template flat<T>().size());
  auto output_data =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

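  // MIOpen needs a scratch workspace for pooling, so the ROCm build passes a
  // DnnScratchAllocator (capped by TF_CUDNN_WORKSPACE_LIMIT_IN_MB); the CUDA
  // build calls the variant without a scratch allocator.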
#if TENSORFLOW_USE_ROCM
  static int64 PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );

  DnnScratchAllocator scratch_allocator(PoolingScratchSize, context);
  bool status =
      stream
          ->ThenPoolForward(pooling_desc, input_desc, input_data, output_desc,
                            &output_data, &scratch_allocator)
          .ok();
#else
  bool status = stream
                    ->ThenPoolForward(pooling_desc, input_desc, input_data,
                                      output_desc, &output_data)
                    .ok();
#endif

  OP_REQUIRES(context, status,
              errors::Internal("dnn PoolForward launch failed"));

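  // If the caller used NDHWC, transpose the NCDHW result back into the
  // caller-provided output tensor.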
  if (data_format == FORMAT_NHWC) {
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 5>()(
        context->eigen_device<GPUDevice>(),
        toConstTensor(transformed_output).template tensor<T, 5>(),
        output->tensor<T, 5>());
  }
}

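// Backward pass of 3D pooling. For max pooling, the original forward input
// and output tensors are required (see the CHECK below) so the DNN library
// can locate the maxima; for average pooling they may be null.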
template <typename T>
void DnnPooling3dGradOp<T>::Compute(
    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
    const std::array<int64, 3>& window, const std::array<int64, 3>& stride,
    const std::array<int64, 3>& padding,
    const std::array<int64, 3>& output_size, TensorFormat data_format,
    const Tensor& out_backprop, const TensorShape& tensor_in_shape,
    const Tensor* tensor_in, const Tensor* tensor_out, Tensor* input_backprop) {
  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
        (tensor_in && tensor_out))
      << "For MaxPoolGrad, both tensor_in and tensor_out need to be "
         "specified";

  const int64_t in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
  const int64_t in_features = GetTensorDim(tensor_in_shape, data_format, 'C');

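  // Mirror the forward pass: transpose NDHWC tensors into NCDHW temporaries.
  // When tensor_in / tensor_out are null (average pooling), temporaries of the
  // right shape are still allocated so the DNN call receives buffers with the
  // expected layout, even though their contents are never filled in here.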
  Tensor transformed_input;
  TensorShape transformed_input_shape;
  if (data_format == FORMAT_NHWC || tensor_in == nullptr) {
    transformed_input_shape =
        ShapeFromFormat(FORMAT_NCHW, tensor_in_shape, data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_input_shape,
                                                   &transformed_input));
  } else {
    transformed_input = *tensor_in;
  }
  Tensor transformed_output;
  TensorShape transformed_output_shape;
  if (data_format == FORMAT_NHWC || tensor_out == nullptr) {
    transformed_output_shape =
        ShapeFromFormat(FORMAT_NCHW, out_backprop.shape(), data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_output_shape,
                                                   &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  Tensor transformed_input_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_input_shape,
                                          &transformed_input_backprop));
  } else {
    transformed_input_backprop = *input_backprop;
  }
  Tensor transformed_output_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_output_shape,
                                          &transformed_output_backprop));
  } else {
    transformed_output_backprop = out_backprop;
  }
  if (data_format == FORMAT_NHWC) {
    if (tensor_in != nullptr) {
      functor::NHWCToNCHW<GPUDevice, T, 5>()(
          context->eigen_device<GPUDevice>(), tensor_in->tensor<T, 5>(),
          transformed_input.tensor<T, 5>());
    }
    if (tensor_out != nullptr) {
      functor::NHWCToNCHW<GPUDevice, T, 5>()(
          context->eigen_device<GPUDevice>(), tensor_out->tensor<T, 5>(),
          transformed_output.tensor<T, 5>());
    }
    functor::NHWCToNCHW<GPUDevice, T, 5>()(
        context->eigen_device<GPUDevice>(), out_backprop.tensor<T, 5>(),
        transformed_output_backprop.tensor<T, 5>());
  }

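  // Descriptor setup mirrors the forward pass: the shapes of the original
  // forward input and output are described in kBatchDepthYX layout, along
  // with the pooling window, stride, and padding.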
  se::dnn::PoolingDescriptor pooling_desc(3);
  pooling_desc.set_pooling_mode(pooling_mode);

  se::dnn::BatchDescriptor orig_output_desc(3);
  orig_output_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);

  se::dnn::BatchDescriptor orig_input_desc(3);
  orig_input_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(se::dnn::DataLayout::kBatchDepthYX);

  for (size_t i = 0; i < window.size(); ++i) {
    const auto dim_i = static_cast<se::dnn::DimIndex>(i);
    pooling_desc.set_window(dim_i, window[i]);
    pooling_desc.set_stride(dim_i, stride[i]);
    pooling_desc.set_padding(dim_i, padding[i]);
    orig_input_desc.set_spatial_dim(
        dim_i, GetTensorDim(tensor_in_shape, data_format, '2' - i));
    orig_output_desc.set_spatial_dim(dim_i, output_size[i]);
  }

  auto orig_output_data =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());
  auto orig_input_data =
      AsDeviceMemory(transformed_input.template flat<T>().data(),
                     transformed_input.template flat<T>().size());
  auto output_backprop_data =
      AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
                     transformed_output_backprop.template flat<T>().size());
  auto input_backprop_data =
      AsDeviceMemory(transformed_input_backprop.template flat<T>().data(),
                     transformed_input_backprop.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

#if TENSORFLOW_USE_ROCM
  static int64 PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );

  DnnScratchAllocator scratch_allocator(PoolingScratchSize, context);
  bool status = stream
                    ->ThenPoolBackward(pooling_desc, orig_input_desc,
                                       orig_input_data, orig_output_desc,
                                       orig_output_data, output_backprop_data,
                                       &input_backprop_data, &scratch_allocator)
                    .ok();
#else
  bool status =
      stream
          ->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data,
                             orig_output_desc, orig_output_data,
                             output_backprop_data, &input_backprop_data)
          .ok();
#endif

  OP_REQUIRES(context, status,
              errors::Internal("dnn PoolBackward launch failed"));

  if (data_format == FORMAT_NHWC) {
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 5>()(
        context->eigen_device<GPUDevice>(),
        toConstTensor(transformed_input_backprop).template tensor<T, 5>(),
        input_backprop->tensor<T, 5>());
  }
}

#define DEFINE_DNN_OPS(T)           \
  template class DnnPooling3dOp<T>; \
  template class DnnPooling3dGradOp<T>;
TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS)
#undef DEFINE_DNN_OPS

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow
286