/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#include <algorithm>
#include <cmath>

#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/kernel_shape_util.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/cast_op.h"
#include "tensorflow/core/kernels/conv_grad_ops.h"
#include "tensorflow/core/kernels/depthwise_conv_op.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
#endif

#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

// Gradient operations for depthwise convolution.

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
// Common code between the two backward pass kernels: verifies that the
// dimensions all match and extracts the padded rows and columns.
#define EXTRACT_AND_VERIFY_DIMENSIONS(label) \
  const Tensor& out_backprop = context->input(2); \
  OP_REQUIRES( \
      context, input_shape.dims() == 4, \
      errors::InvalidArgument(label, ": input must be 4-dimensional")); \
  OP_REQUIRES( \
      context, filter_shape.dims() == 4, \
      errors::InvalidArgument(label, ": filter must be 4-dimensional")); \
  OP_REQUIRES( \
      context, out_backprop.dims() == 4, \
      errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
  const int64 batch = input_shape.dim_size(0); \
  OP_REQUIRES( \
      context, batch == out_backprop.dim_size(0), \
      errors::InvalidArgument( \
          label, ": input and out_backprop must have the same batch size")); \
  const int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H'); \
  OP_REQUIRES( \
      context, \
      FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()), \
      errors::InvalidArgument("Input rows too large")); \
  const int32 input_rows = static_cast<int32>(input_rows_raw); \
  const int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W'); \
  OP_REQUIRES( \
      context, \
      FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()), \
      errors::InvalidArgument("Input cols too large")); \
  const int32 input_cols = static_cast<int32>(input_cols_raw); \
  const int64 filter_rows = filter_shape.dim_size(0); \
  const int64 filter_cols = filter_shape.dim_size(1); \
  const int64 output_rows_raw = \
      GetTensorDim(out_backprop.shape(), data_format_, 'H'); \
  OP_REQUIRES( \
      context, \
      FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()), \
      errors::InvalidArgument("Output rows too large")); \
  const int32 output_rows = static_cast<int32>(output_rows_raw); \
  const int64 output_cols_raw = \
      GetTensorDim(out_backprop.shape(), data_format_, 'W'); \
  OP_REQUIRES( \
      context, \
      FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()), \
      errors::InvalidArgument("Output cols too large")); \
  const int32 output_cols = static_cast<int32>(output_cols_raw); \
  const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C'); \
  OP_REQUIRES(context, in_depth == filter_shape.dim_size(2), \
              errors::InvalidArgument( \
                  label, ": input and filter must have the same in_depth")); \
  const int64 depth_multiplier = filter_shape.dim_size(3); \
  const int64 out_depth_raw = \
      GetTensorDim(out_backprop.shape(), data_format_, 'C'); \
  OP_REQUIRES( \
      context, \
      FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()), \
      errors::InvalidArgument("Output depth too large")); \
  const int32 out_depth = static_cast<int32>(out_depth_raw); \
  OP_REQUIRES( \
      context, (depth_multiplier * in_depth) == out_depth, \
      errors::InvalidArgument( \
          label, ": depth_multiplier * in_depth not equal to out_depth")); \
  const auto stride = stride_; \
  int64 out_rows = 0, out_cols = 0, pad_top = 0, pad_bottom = 0, pad_left = 0, \
        pad_right = 0; \
  if (padding_ == Padding::EXPLICIT) { \
    GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'H', &pad_top, \
                             &pad_bottom); \
    GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'W', &pad_left, \
                             &pad_right); \
  } \
  OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( \
                              input_rows, filter_rows, stride_, padding_, \
                              &out_rows, &pad_top, &pad_bottom)); \
  OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( \
                              input_cols, filter_cols, stride_, padding_, \
                              &out_cols, &pad_left, &pad_right)); \
  OP_REQUIRES( \
      context, output_rows == out_rows, \
      errors::InvalidArgument( \
          label, ": Number of rows of out_backprop doesn't match computed: ", \
          "actual = ", output_rows, ", computed = ", out_rows)); \
  OP_REQUIRES( \
      context, output_cols == out_cols, \
      errors::InvalidArgument( \
          label, ": Number of cols of out_backprop doesn't match computed: ", \
          "actual = ", output_cols, ", computed = ", out_cols)); \
  DepthwiseArgs args; \
  args.batch = batch; \
  args.in_rows = input_rows; \
  args.in_cols = input_cols; \
  args.in_depth = in_depth; \
  args.filter_rows = filter_rows; \
  args.filter_cols = filter_cols; \
  args.depth_multiplier = depth_multiplier; \
  args.stride = stride; \
  args.pad_rows = pad_top; \
  args.pad_cols = pad_left; \
  args.out_rows = out_rows; \
  args.out_cols = out_cols; \
  args.out_depth = out_depth; \
  VLOG(2) << "DepthwiseConv2d: " << label << " Input: [" << batch << ", " \
          << input_rows << ", " << input_cols << ", " << in_depth \
          << "]; Filter: [" << filter_rows << ", " << filter_cols << ", " \
          << in_depth << ", " << depth_multiplier << "]; stride = " << stride \
          << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left \
          << ", output: [" << batch << ", " << out_rows << ", " << out_cols \
          << ", " << out_depth << "]";

// Copies data from local region in 'out_backprop' into 'buffer'.
// The local region coordinates are calculated as the set of output points which
// used the input point ('in_r', 'in_c') as input during the forward pass.
// Rather than spatially reversing the filter, the input is reversed during
// the copy. The copied data is padded to vector register-width boundaries so
// that it is aligned for efficient traversal and vector multiply-add by the
// depthwise input kernel.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//
//   'out_backprop': [batch, out_rows, out_cols, out_depth]
//
//     [a00, a01, a10, a11] [a20, a21, b00, b01]
//     [b10, b11, b20, b21] [...]
//     [e00, e01, e10, e11] [e20, e21, f00, f01]
//     [f10, f11, f20, f21] [...]
//
//   'buffer' (register boundaries shown):
//
//     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
//     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
//     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
//     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
//
template <typename T>
static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
                                     const int64_t padded_filter_inner_dim_size,
                                     const int64_t in_r, const int64_t in_c,
                                     const T* out_backprop, T* buffer) {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

  const int64_t stride = args.stride;
  const int64_t filter_rows = args.filter_rows;
  const int64_t filter_cols = args.filter_cols;
  const int64_t pad_rows = args.pad_rows;
  const int64_t pad_cols = args.pad_cols;
  const int64_t out_rows = args.out_rows;
  const int64_t out_cols = args.out_cols;

  // Calculate the output spatial region which used point (in_r, in_c) as input.
  const int64_t out_r_start = std::max(
      static_cast<int64>(0), (in_r - filter_rows + pad_rows + stride) / stride);
  const int64_t out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
  const int64_t out_c_start = std::max(
      static_cast<int64>(0), (in_c - filter_cols + pad_cols + stride) / stride);
  const int64_t out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);
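  // For example (illustrative values only): with filter_rows = 3, stride = 2
  // and pad_rows = 1, input row in_r = 4 maps to the output-row range
  // [(4 - 3 + 1 + 2) / 2, (4 + 1) / 2] = [2, 2], i.e. only output row 2 read
  // this input row during the forward pass.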

  // Zero-pad 'buffer' if output region is smaller than filter spatial size.
  const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
  if ((out_r_end - out_r_start + 1) < args.filter_rows ||
      (out_c_end - out_c_start + 1) < args.filter_cols) {
    memset(buffer, 0,
           filter_spatial_size * padded_filter_inner_dim_size * sizeof(T));
  }

  // Calculate vectorized and scalar (residual) lengths for 'out_depth'.
  const int64_t vectorized_size = (args.out_depth / kPacketSize) * kPacketSize;
  const int64_t scalar_size = args.out_depth % kPacketSize;
  const int64_t pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
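  // E.g. (illustrative): out_depth = 6 with kPacketSize = 4 gives
  // vectorized_size = 4, scalar_size = 2 and pad_size = 2, so each copied
  // row below occupies two packets with the last two lanes zeroed.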

  for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
    const int64_t f_r = in_r + pad_rows - out_r * stride;
    for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
      const int64_t f_c = in_c + pad_cols - out_c * stride;
      const int64_t buf_base =
          (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
      // Calculate index into 'out_backprop' for coordinate (out_r, out_c).
      auto* out_bprop =
          out_backprop + (out_r * args.out_cols + out_c) * args.out_depth;

      // Copy vectorized portion of inner dimension into 'buffer'.
      for (int64_t d = 0; d < vectorized_size; d += kPacketSize) {
        auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
        Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
      }
      // Copy scalar portion of out_bprop to 'buffer'
      for (int64_t d = 0; d < scalar_size; ++d) {
        buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
      }
      // Pad to vector-register width (if needed).
      for (int64_t d = 0; d < pad_size; ++d) {
        buffer[buf_base + vectorized_size + scalar_size + d] =
            static_cast<T>(0);
      }
    }
  }
}

// Computes the vectorized product of 'buffer' and 'filter' and stores
// result in 'output' at location computed from 'in_r' and 'in_c'.
// If depth_multiplier is > 1, the intermediate output is reduced along
// the depth_multiplier dimension.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
//
//   'buffer' [rows, cols, in_depth, depth_multiplier]
//
//     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
//     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
//     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
//     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
//
//   filter [rows, cols, in_depth, depth_multiplier]
//     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
//     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
//
//   First output register [in_depth, depth_multiplier]
//     [q00, q01, q10, q11] = ([f00, f01, f10, f11] x [u0, v0, w0, x0]) +
//                            ([e00, e01, e10, e11] x [u1, v1, w1, x1]) +
//                            ([b00, b01, b10, b11] x [u2, v2, w2, x2]) +
//                            ([a00, a01, a10, a11] x [u3, v3, w3, x3])
//
//   Reduction step along depth-multiplier dimension:
//
//     [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0]
//

template <typename T>
static void ComputeBackpropInput(const DepthwiseArgs& args,
                                 const int64_t padded_filter_inner_dim_size,
                                 const int64_t in_r, const int64_t in_c,
                                 const T* filter, const T* buffer,
                                 T* out_buffer, T* output) {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

  const int64_t in_depth = args.in_depth;
  const int64_t depth_multiplier = args.depth_multiplier;
  const int64_t out_depth = args.out_depth;
  const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;

  // Calculate vectorized and scalar lengths of 'out_depth'.
  const int64_t output_vectorized_size =
      (out_depth / kPacketSize) * kPacketSize;
  const int64_t output_scalar_size = out_depth % kPacketSize;

  // Calculate base index at which to begin writing output.
  const int64_t base_output_index = (in_r * args.in_cols + in_c) * in_depth;

  // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is
  // used to efficiently reduce output when 'depth_multiplier' > kPacketSize.
  const int64_t dm_vectorized_size =
      (depth_multiplier / kPacketSize) * kPacketSize;
  const int64_t dm_scalar_size = depth_multiplier % kPacketSize;
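  // E.g. (illustrative): depth_multiplier = 6 with kPacketSize = 4 gives
  // dm_vectorized_size = 4 and dm_scalar_size = 2 for the reduction loop at
  // the end of this function.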

  for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
    // Reset accumulator.
    auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
    for (int j = 0; j < filter_spatial_size; ++j) {
      // Calculate index.
      const int64_t index = i + j * padded_filter_inner_dim_size;
      // Load filter.
      const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
      // Load input.
      const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
      // Vector multiply-add.
      vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
    }
    if (depth_multiplier == 1) {
      // Write directly to the output.
      Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
    } else {
      // Buffer output for subsequent reduction step.
      Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
    }
  }

  if (output_scalar_size > 0) {
    auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
    for (int j = 0; j < filter_spatial_size; ++j) {
      const int64_t index =
          output_vectorized_size + j * padded_filter_inner_dim_size;
      const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
      const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
      vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
    }
    // Load accumulator into an array and loop through output.
    T out_buf[kPacketSize];
    Eigen::internal::pstoreu<T>(out_buf, vaccum);
    if (depth_multiplier == 1) {
      // Write directly to the output.
      for (int j = 0; j < output_scalar_size; ++j) {
        output[base_output_index + output_vectorized_size + j] = out_buf[j];
      }
    } else {
      // Buffer output for subsequent reduction step.
      for (int j = 0; j < output_scalar_size; ++j) {
        out_buffer[output_vectorized_size + j] = out_buf[j];
      }
    }
  }

  // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'.
  if (depth_multiplier > 1) {
    for (int64_t d = 0; d < in_depth; ++d) {
      const int64_t index = d * args.depth_multiplier;
      T accum = static_cast<T>(0);
      for (int64_t dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
        const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
        accum += Eigen::internal::predux(v);
      }
      // Copy scalar portion of replicated output.
      for (int64_t dm = 0; dm < dm_scalar_size; ++dm) {
        accum += out_buffer[index + dm_vectorized_size + dm];
      }
      // Copy to output.
      output[base_output_index + d] = accum;
    }
  }
}

// Computes the depthwise conv2d backprop input of 'out_backprop' by
// 'depthwise_filter' and stores the result in 'in_backprop'.
template <typename T>
struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;

  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* depthwise_filter,
                  T* in_backprop, TensorFormat data_format) {
    OP_REQUIRES(
        ctx, data_format == FORMAT_NHWC,
        errors::Unimplemented(
            "Depthwise convolution on CPU is only supported for NHWC format"));

    static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

    // Pad 'depthwise_filter' to vector register width (if needed).
    const bool pad_filter = (args.out_depth % kPacketSize) != 0;
    Tensor padded_filter;
    if (pad_filter) {
      // Allocate space for padded filter.
      const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64_t padded_filter_inner_dim_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
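      // E.g. (illustrative): out_depth = 10 with kPacketSize = 4 rounds the
      // filter inner dimension up to 12 elements per spatial position.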
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({filter_spatial_size,
                                               padded_filter_inner_dim_size}),
                                  &padded_filter));
      // Write out padded filter.
      functor::DepthwiseFilterPadOp<T>()(
          args, depthwise_filter, padded_filter.template flat<T>().data());
    }
    const T* filter_data =
        pad_filter ? padded_filter.template flat<T>().data() : depthwise_filter;

    // Computes one shard of depthwise conv2d backprop input.
    auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
                     int64_t start, int64_t limit) {
      static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

      const int64_t input_image_size =
          args.in_rows * args.in_cols * args.in_depth;
      const int64_t output_image_size =
          args.out_rows * args.out_cols * args.out_depth;
      const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64_t padded_filter_inner_dim_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

      // Allocate buffer to copy regions from 'out_backprop'.
      Tensor out_bprop_buffer;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({filter_spatial_size,
                                               padded_filter_inner_dim_size}),
                                  &out_bprop_buffer));
      T* out_bprop_buf = out_bprop_buffer.template flat<T>().data();

      // Allocate buffer for intermediate results.
      Tensor in_bprop_buffer;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({padded_filter_inner_dim_size}),
                                  &in_bprop_buffer));
      T* in_bprop_buf = in_bprop_buffer.template flat<T>().data();

      for (int64_t b = start; b < limit; ++b) {
        for (int64_t in_r = 0; in_r < args.in_rows; ++in_r) {
          for (int64_t in_c = 0; in_c < args.in_cols; ++in_c) {
            // Populate 'out_bprop_buf' from local 'out_backprop' region.
            CopyOutputBackpropRegion<T>(
                args, padded_filter_inner_dim_size, in_r, in_c,
                out_backprop + b * output_image_size, out_bprop_buf);

            // Compute depthwise backprop input.
            ComputeBackpropInput<T>(args, padded_filter_inner_dim_size, in_r,
                                    in_c, filter_data, out_bprop_buf,
                                    in_bprop_buf,
                                    in_backprop + b * input_image_size);
          }
        }
      }
    };

    const int64_t shard_cost = args.in_rows * args.in_cols * args.out_depth;
    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
    Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
          shard_cost, shard);
  }
};

template <typename T>
static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
                                                const T* out_backprop,
                                                const T* filter,
                                                T* in_backprop) {
  // Naive for loop as a reference point without concerns about performance.
  for (int b = 0; b < args.batch; ++b) {
    for (int in_r = 0; in_r < args.in_rows; ++in_r) {
      for (int in_c = 0; in_c < args.in_cols; ++in_c) {
        for (int in_d = 0; in_d < args.in_depth; ++in_d) {
          T sum = 0;
          const int stride = args.stride;
          const int out_d_start = in_d * args.depth_multiplier;
          const int out_d_end = out_d_start + args.depth_multiplier;

          for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
            const int out_r_start = std::max(
                0, (in_r - args.filter_rows + args.pad_rows + stride) / stride);
            const int out_r_end =
                std::min(args.out_rows - 1, (in_r + args.pad_rows) / stride);

            for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
              const int out_c_start = std::max(
                  0,
                  (in_c - args.filter_cols + args.pad_cols + stride) / stride);
              const int out_c_end =
                  std::min(args.out_cols - 1, (in_c + args.pad_cols) / stride);

              for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
                int f_r = in_r + args.pad_rows - out_r * stride;
                int f_c = in_c + args.pad_cols - out_c * stride;
                int filter_dm = out_d - out_d_start;
                int out_backprop_offset =
                    out_d +
                    args.out_depth *
                        (out_c + args.out_cols * (out_r + args.out_rows * b));
                int filter_offset =
                    filter_dm +
                    args.depth_multiplier *
                        (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
                sum +=
                    out_backprop[out_backprop_offset] * filter[filter_offset];
              }
            }
          }

          int in_backprop_offset =
              in_d +
              args.in_depth * (in_c + args.in_cols * (in_r + args.in_rows * b));
          in_backprop[in_backprop_offset] = sum;
        }
      }
    }
  }
}

// Extern template instantiated in conv_grad_input_ops.cc.
extern template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>;
extern template struct LaunchConv2DBackpropInputOp<CPUDevice, float>;
extern template struct LaunchConv2DBackpropInputOp<CPUDevice, double>;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Extern template instantiated in conv_grad_input_ops.cc.
extern template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
extern template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
extern template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;

// Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
                                                          Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Kernel to compute the input backprop for depthwise convolution.
template <typename Device, class T>
class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
 public:
  explicit DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));

    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));

    stride_ = GetTensorDim(strides_, data_format_, 'H');
    const int64_t stride_w = GetTensorDim(strides_, data_format_, 'W');
    const int64_t stride_n = GetTensorDim(strides_, data_format_, 'N');
    const int64_t stride_c = GetTensorDim(strides_, data_format_, 'C');

    OP_REQUIRES(context, stride_ == stride_w,
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (stride_n == 1 && stride_c == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("explicit_paddings", &explicit_paddings_));
    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                              /*num_dims=*/4, data_format_));

    cudnn_use_autotune_ = CudnnUseAutotune();
    dtype_ = DataTypeToEnum<T>::value;
#if CUDNN_VERSION >= 8000
    // From the cuDNN release note 8.0: We’ve extended the fprop and dgrad
    // NHWC depthwise kernels to support more combinations (filter
    // sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
    // to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
    // good performance. (https://docs.nvidia.com/deeplearning/sdk/cudnn-
    // release-notes/rel_8.html#rel_8)
    use_cudnn_grouped_conv_ =
        dtype_ == DT_HALF &&
        ((data_format_ == FORMAT_NCHW && stride_ == 1 && stride_w == 1) ||
         (data_format_ == FORMAT_NHWC && stride_ == stride_w &&
          (stride_ == 1 || stride_ == 2)));
#elif CUDNN_VERSION >= 7603
    // Use CuDNN grouped conv (input gradient) when stride = 1, input/output is
    // NCHW and float16(half). See cudnn release note 7.6.3 (https://docs.nvidi
    // a.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763).
    use_cudnn_grouped_conv_ = dtype_ == DT_HALF &&
                              data_format_ == FORMAT_NCHW && stride_ == 1 &&
                              stride_w == 1;
#else
    use_cudnn_grouped_conv_ = false;
#endif
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input_sizes = context->input(0);
    const Tensor& filter = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(input_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
            input_sizes.dims()));
    TensorShape input_shape;
    const int32* in_sizes_data = input_sizes.template flat<int32>().data();

    for (int i = 0; i < input_sizes.NumElements(); ++i) {
      OP_REQUIRES(context, in_sizes_data[i] >= 0,
                  errors::InvalidArgument("Dimension ", i,
                                          " of input_sizes must be >= 0"));
      input_shape.AddDim(in_sizes_data[i]);
    }
    const TensorShape& filter_shape = filter.shape();
    EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");

    Tensor* in_backprop = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {0}, 0, input_shape, &in_backprop));

    // If there is nothing to compute, return.
    if (input_shape.num_elements() == 0) {
      return;
    }

    // If in_depth==1, this operation is just a standard convolution.
    // Depthwise convolution is a special case of cuDNN's grouped convolution.
    bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
                     (in_depth == 1 ||
                      (use_cudnn_grouped_conv_ &&
                       IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
                                                  /*filter_cols=*/filter_cols,
                                                  /*in_depth=*/in_depth,
                                                  /*out_depth=*/out_depth)));

    VLOG(2) << "DepthwiseConv2dNativeBackpropInput: "
            << " Input: [" << batch << ", " << input_rows << ", " << input_cols
            << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
            << filter_cols << ", " << in_depth << ", " << depth_multiplier
            << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
            << ", " << out_depth << "], stride = " << stride_
            << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left
            << ", Use cuDNN: " << use_cudnn;

    if (use_cudnn) {
      // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
      //
      //                  | TensorFlow       | cuDNN
      // --------------------------------------------------------------------
      // filter_out_depth | depth_multiplier | depth_multiplier * group_count
      // filter_in_depth  | in_depth         | in_depth / group_count
      //
      // For depthwise convolution, we have group_count == in_depth.
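      //
      // E.g. (illustrative): with in_depth = 8 and depth_multiplier = 2, the
      // TensorFlow filter [filter_rows, filter_cols, 8, 2] is reinterpreted
      // as a cuDNN grouped-convolution filter of shape
      // [filter_rows, filter_cols, 1, 16] with group_count = 8.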
      int32_t filter_in_depth = 1;
      TensorShape shape =
          TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
      Tensor reshaped_filter(/*type=*/dtype_);
      OP_REQUIRES(
          context, reshaped_filter.CopyFrom(filter, shape),
          errors::Internal(
              "Failed to reshape filter tensor for grouped convolution."));
      // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
      // conv is supported.
      launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
                reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
                stride_, stride_, padding_, explicit_paddings_, in_backprop,
                data_format_);
      return;
    }

    auto out_backprop_ptr = out_backprop.template flat<T>().data();
    auto filter_ptr = filter.template flat<T>().data();
    auto in_backprop_ptr = in_backprop->template flat<T>().data();
    LaunchDepthwiseConvBackpropInputOp<Device, T>()(
        context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
        data_format_);
  }

 protected:
  bool use_cudnn_grouped_conv_;

 private:
  std::vector<int32> strides_;
  Padding padding_;
  std::vector<int64> explicit_paddings_;
  TensorFormat data_format_;
  int64 stride_;

  // For in_depth == 1 and grouped convolutions.
  LaunchConv2DBackpropInputOp<Device, T> launcher_;
  bool cudnn_use_autotune_;
  DataType dtype_;

  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
};

#define REGISTER_CPU_KERNEL(T) \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
                              .Device(DEVICE_CPU) \
                              .TypeConstraint<T>("T"), \
                          DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);

TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
#endif
#undef REGISTER_CPU_KERNEL

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_GPU_KERNEL(T) \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
                              .Device(DEVICE_GPU) \
                              .TypeConstraint<T>("T") \
                              .HostMemory("input_sizes"), \
                          DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>)

TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL

#if CUDNN_VERSION >= 7000
template <typename T>
class DepthwiseConv2dGroupedConvBackpropInputOp
    : public DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T> {
 public:
  DepthwiseConv2dGroupedConvBackpropInputOp(OpKernelConstruction* context)
      : DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>(context) {
    this->use_cudnn_grouped_conv_ = true;
  }
};

#define REGISTER_GROUPED_CONV_KERNEL(T) \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
                              .Device(DEVICE_GPU) \
                              .TypeConstraint<T>("T") \
                              .HostMemory("input_sizes") \
                              .Label("cudnn_grouped_convolution"), \
                          DepthwiseConv2dGroupedConvBackpropInputOp<T>)

TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
#undef REGISTER_GROUPED_CONV_KERNEL
#endif  // CUDNN_VERSION
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Kernels to compute the gradients of the filters for depthwise convolution.

// Computes filter backprop using 'out_backprop' and 'input_buffer', storing the
// result in 'output_buffer' at an index computed from 'out_r' and 'out_c'.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
//
//   'input_buffer' [rows, cols, in_depth, depth_multiplier]
//
//     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
//     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
//     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
//     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
//
//   'out_backprop' [out_rows, out_cols, in_depth, depth_multiplier]
//
//     [q00, q01, q10, q11] [q20, q21, r00, r01]
//     [r10, r11, r20, r21] [s00, s01, s10, s11]
//     [s20, s21, t00, t01] [t10, t11, t20, t21]
//
//   First output register of 'filter_backprop'
//     [u0, v0, w0, x0] += ([f00, f01, f10, f11] x [q00, q01, q10, q11])
//
template <typename T>
static void ComputeBackpropFilter(const DepthwiseArgs& args,
                                  const int64_t padded_out_depth_size,
                                  const int64_t out_r, const int64_t out_c,
                                  const T* out_backprop, const T* input_buffer,
                                  T* output_buffer) {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
  // Calculate vectorized size of 'padded_out_depth_size'.
  const int64_t out_depth = args.out_depth;
  const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
  const int64_t output_vectorized_size =
      (padded_out_depth_size / kPacketSize) * kPacketSize;
  const int64_t base_output_index = (out_r * args.out_cols + out_c) * out_depth;
  // Determine whether we can execute fast or slow code path.
  const int64_t output_image_size =
      args.out_rows * args.out_cols * args.out_depth;
  const int64_t output_last_vector_index =
      output_image_size - (filter_spatial_size * padded_out_depth_size);
  const bool fast_path = base_output_index <= output_last_vector_index;
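  // E.g. (illustrative): with output_image_size = 64, filter_spatial_size = 4
  // and padded_out_depth_size = 8, the conservative bound above allows the
  // vectorized path only while base_output_index <= 64 - 32 = 32; later
  // output points fall through to the scalar copy below so that packet loads
  // never read past the end of 'out_backprop'.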

  if (fast_path) {
    // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can
    // amortize the cost of 'output_buffer' load store in the loop below.
    for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
      // Load vector register from 'out_backprop'.
      const auto out_bprop_block =
          Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
      for (int j = 0; j < filter_spatial_size; ++j) {
        const int64_t index = i + j * padded_out_depth_size;
        // Load vector register from 'input_buffer'.
        const auto input_block =
            Eigen::internal::ploadu<Packet>(input_buffer + index);
        // Load output block into vector register.
        auto out_block_data = output_buffer + index;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        // Vector multiply-add.
        out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
                                                   out_block);
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
    }
  } else {
    // Slow path (can't do vector reads from non-padded 'out_backprop').
    for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
      // Calculate safe read size from 'out_backprop'.
      const int64_t out_bprop_index = base_output_index + i;
      const int64_t out_bprop_limit =
          std::min(output_image_size, out_bprop_index + kPacketSize);
      T out_buf[kPacketSize];
      memset(&out_buf, 0, kPacketSize * sizeof(T));
      const int64_t scalar_size = out_bprop_limit - out_bprop_index;
      for (int64_t j = 0; j < scalar_size; ++j) {
        out_buf[j] = out_backprop[out_bprop_index + j];
      }
      // Load vector register from 'out_buf'.
      const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
      for (int j = 0; j < filter_spatial_size; ++j) {
        const int64_t index = i + j * padded_out_depth_size;
        // Load vector register from 'input_buffer'.
        const auto input_block =
            Eigen::internal::ploadu<Packet>(input_buffer + index);
        // Load output block into vector register.
        auto out_block_data = output_buffer + index;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        // Vector multiply-add.
        out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
                                                   out_block);
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
    }
  }
}

template <typename Device, typename T>
struct LaunchDepthwiseConvBackpropFilterOp;

template <typename T>
struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;

  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* input, T* filter_backprop,
                  TensorFormat data_format) {
    OP_REQUIRES(
        ctx, data_format == FORMAT_NHWC,
        errors::Unimplemented(
            "Depthwise convolution on CPU is only supported for NHWC format"));

    static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

    const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
    const int64_t padded_out_depth_size =
        ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

    // Allocate output buffers for each image in 'batch' (padded to vector
    // register boundaries).
    Tensor output_buffer;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                TensorShape({args.batch, filter_spatial_size,
                                             padded_out_depth_size}),
                                &output_buffer));
    T* output_buffer_data = output_buffer.template flat<T>().data();

    // Computes one shard of depthwise conv2d backprop filter.
    auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
                     int64_t start, int64_t limit) {
      static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
      const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64_t padded_out_depth_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

      // Allocate buffer for local input regions.
      Tensor input_buffer;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(
                   DataTypeToEnum<T>::value,
                   TensorShape({filter_spatial_size, padded_out_depth_size}),
                   &input_buffer));
      T* input_buffer_data = input_buffer.template flat<T>().data();

      const int64_t input_image_size =
          args.in_rows * args.in_cols * args.in_depth;
      const int64_t output_image_size =
          args.out_rows * args.out_cols * args.out_depth;
      const int64_t padded_filter_size =
          filter_spatial_size * padded_out_depth_size;

      for (int b = start; b < limit; ++b) {
        // Initialize 'output_buffer' for 'b'.
        auto* output_buffer = output_buffer_data + b * padded_filter_size;
        memset(output_buffer, 0, padded_filter_size * sizeof(T));

        for (int out_r = 0; out_r < args.out_rows; ++out_r) {
          for (int out_c = 0; out_c < args.out_cols; ++out_c) {
            // Populate 'input_buffer_data' with data from local input region.
            functor::DepthwiseInputCopyOp<T>()(
                args, padded_out_depth_size, out_r, out_c,
                input + b * input_image_size, input_buffer_data);
            // Compute depthwise backprop filter.
            ComputeBackpropFilter(args, padded_out_depth_size, out_r, out_c,
                                  out_backprop + b * output_image_size,
                                  input_buffer_data, output_buffer);
          }
        }
      }
    };
    const int64_t shard_cost = args.out_rows * args.out_cols * args.out_depth;
    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
    Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
          shard_cost, shard);

    // Accumulate 'output_buffer' from each shard into 'output'.
    const int64_t out_depth = args.out_depth;
    const int64_t vectorized_size = (out_depth / kPacketSize) * kPacketSize;
    const int64_t scalar_size = out_depth - vectorized_size;
    const int64_t padded_filter_size =
        filter_spatial_size * padded_out_depth_size;
    memset(filter_backprop, 0, filter_spatial_size * out_depth * sizeof(T));

    for (int64_t i = 0; i < filter_spatial_size; ++i) {
      const int64_t buffer_base = i * padded_out_depth_size;
      const int64_t output_base = i * out_depth;
      // Write vectorized length of filter's inner dimension to output.
      for (int64_t j = 0; j < vectorized_size; j += kPacketSize) {
        // Load data from 'filter_backprop' into vector register.
        auto out_block_data = filter_backprop + output_base + j;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        for (int b = 0; b < args.batch; ++b) {
          // Load data from 'output_buffer' for 'b'.
          const auto* output_buffer =
              output_buffer_data + b * padded_filter_size;
          const auto v =
              Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j);
          // Add 'v' to 'out_block'.
          out_block = Eigen::internal::padd<Packet>(out_block, v);
        }
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
      // Write scalar length of filter's inner dimension to output.
      for (int64_t j = 0; j < scalar_size; ++j) {
        for (int b = 0; b < args.batch; ++b) {
          const auto* output_buffer =
              output_buffer_data + b * padded_filter_size;
          filter_backprop[output_base + vectorized_size + j] +=
              output_buffer[buffer_base + vectorized_size + j];
        }
      }
    }
  }
};

template <typename T>
static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
                                                 const T* out_backprop,
                                                 const T* input,
                                                 T* filter_backprop) {
  int num_filter_backprop = args.filter_rows * args.filter_cols *
                            args.in_depth * args.depth_multiplier;
  memset(filter_backprop, 0, num_filter_backprop * sizeof(T));
  // Naive for loop as a reference point without concerns about performance.
  for (int b = 0; b < args.batch; ++b) {
    for (int out_r = 0; out_r < args.out_rows; ++out_r) {
      for (int out_c = 0; out_c < args.out_cols; ++out_c) {
        for (int out_d = 0; out_d < args.out_depth; ++out_d) {
          const int in_d = out_d / args.depth_multiplier;
          const int dm = out_d % args.depth_multiplier;
          const int in_r_start = out_r * args.stride - args.pad_rows;
          const int in_c_start = out_c * args.stride - args.pad_cols;
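          // E.g. (illustrative): with depth_multiplier = 2, output channel
          // out_d = 5 comes from input channel in_d = 2 and filter
          // depth-multiplier slot dm = 1.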

          for (int f_r = 0; f_r < args.filter_rows; ++f_r) {
            for (int f_c = 0; f_c < args.filter_cols; ++f_c) {
              const int in_r = in_r_start + f_r;
              const int in_c = in_c_start + f_c;

              if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
                  in_c < args.in_cols) {
                int out_backprop_offset =
                    out_d +
                    args.out_depth *
                        (out_c + args.out_cols * (out_r + args.out_rows * b));
                int input_offset =
                    in_d +
                    args.in_depth *
                        (in_c + args.in_cols * (in_r + args.in_rows * b));
                int filter_backprop_offset =
                    dm +
                    args.depth_multiplier *
                        (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
                filter_backprop[filter_backprop_offset] +=
                    input[input_offset] * out_backprop[out_backprop_offset];
              }
            }
          }
        }
      }
    }
  }
}

// Extern template instantiated in conv_grad_filter_ops.cc.
extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, Eigen::half>;
extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, float>;
extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, double>;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Extern template instantiated in conv_grad_filter_ops.cc.
extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, Eigen::half>;
extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, float>;
extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, double>;

// Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
                                                           Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Kernel to compute the filter backprop for depthwise convolution.
template <typename Device, class T>
class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
 public:
  explicit DepthwiseConv2dNativeBackpropFilterOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));

    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));

    stride_ = GetTensorDim(strides_, data_format_, 'H');
    const int64_t stride_w = GetTensorDim(strides_, data_format_, 'W');
    const int64_t stride_n = GetTensorDim(strides_, data_format_, 'N');
    const int64_t stride_c = GetTensorDim(strides_, data_format_, 'C');

    OP_REQUIRES(context, stride_ == stride_w,
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (stride_n == 1 && stride_c == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("explicit_paddings", &explicit_paddings_));
    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                              /*num_dims=*/4, data_format_));

    cudnn_use_autotune_ = CudnnUseAutotune();

    if (std::is_same<T, Eigen::half>::value) {
      dtype_ = DT_HALF;
    } else if (std::is_same<T, float>::value) {
      dtype_ = DT_FLOAT;
    } else if (std::is_same<T, double>::value) {
      dtype_ = DT_DOUBLE;
    } else {
      LOG(ERROR) << "Only half, float, and double are supported.";
    }
    // Use CuDNN grouped conv (filter gradients) when input/output is
    // float16(half). See cudnn release note 7.6.3. (https://docs.nvidia.com/dee
    // plearning/sdk/cudnn-release-notes/rel_763.html#rel_763)
#if CUDNN_VERSION >= 7603
    use_cudnn_grouped_conv_ = dtype_ == DT_HALF;
#else
    use_cudnn_grouped_conv_ = false;
#endif
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& filter_sizes = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(filter_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
            filter_sizes.dims()));
    TensorShape filter_shape;
    const int32* filter_sizes_data = filter_sizes.template flat<int32>().data();
    for (int i = 0; i < filter_sizes.NumElements(); ++i) {
      OP_REQUIRES(context, filter_sizes_data[i] >= 0,
                  errors::InvalidArgument("Dimension ", i,
                                          " of filter_sizes must be >= 0"));
      filter_shape.AddDim(filter_sizes_data[i]);
    }
    const TensorShape& input_shape = input.shape();

    EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropFilter");
    Tensor* filter_backprop = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {1}, 0, filter_shape, &filter_backprop));

    // If there is nothing to compute, return.
    if (out_backprop.shape().num_elements() == 0) {
      return;
    }

    // If in_depth==1, this operation is just a standard convolution.
    // Depthwise convolution is a special case of cuDNN's grouped convolution.
    bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
                     (in_depth == 1 ||
                      (use_cudnn_grouped_conv_ &&
                       IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
                                                  /*filter_cols=*/filter_cols,
                                                  /*in_depth=*/in_depth,
                                                  /*out_depth=*/out_depth)));

    VLOG(2) << "DepthwiseConv2dNativeBackpropFilter: "
            << " Input: [" << batch << ", " << input_rows << ", " << input_cols
            << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
            << filter_cols << ", " << in_depth << ", " << depth_multiplier
            << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
            << ", " << out_depth << "], stride = " << stride_
            << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left
            << ", Use cuDNN: " << use_cudnn;

    if (use_cudnn) {
      // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
      //
      //                  | TensorFlow       | cuDNN
      // --------------------------------------------------------------------
      // filter_out_depth | depth_multiplier | depth_multiplier * group_count
      // filter_in_depth  | in_depth         | in_depth / group_count
      //
      // For depthwise convolution, we have group_count == in_depth.
      int32_t filter_in_depth = 1;
      TensorShape shape =
          TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
      Tensor reshaped_filter(/*type=*/dtype_);
      OP_REQUIRES(
          context, reshaped_filter.CopyFrom(*filter_backprop, shape),
          errors::Internal(
              "Failed to reshape filter tensor for grouped convolution."));

      // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
      // conv is supported.
      launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
                input,
                /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
                padding_, explicit_paddings_, &reshaped_filter, data_format_);
      return;
    }

    // For GPU inputs with type half, we cast inputs to float and outputs back
    // to half, as half implementation is slow and does not use full precision
    // accumulation in some cases.
    constexpr bool cast_to_float = std::is_same<T, Eigen::half>::value &&
                                   std::is_same<Device, GPUDevice>::value;
    using U = typename std::conditional<cast_to_float, float, T>::type;
    Tensor casted_out_backprop = out_backprop;
    Tensor casted_input = input;
    Tensor casted_filter_backprop = *filter_backprop;
    const Device& device = context->template eigen_device<Device>();
    if (cast_to_float) {
      functor::CastFunctor<Device, float, Eigen::half> cast;
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DT_FLOAT, out_backprop.shape(),
                                            &casted_out_backprop));
      cast(device, casted_out_backprop.template flat<float>(),
           out_backprop.template flat<Eigen::half>());
      OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, input.shape(),
                                                     &casted_input));
      cast(device, casted_input.template flat<float>(),
           input.template flat<Eigen::half>());
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DT_FLOAT, filter_backprop->shape(),
                                            &casted_filter_backprop));
    }

    auto out_backprop_ptr = casted_out_backprop.template flat<U>().data();
    auto input_ptr = casted_input.template flat<U>().data();
    auto filter_backprop_ptr = casted_filter_backprop.template flat<U>().data();
    LaunchDepthwiseConvBackpropFilterOp<Device, U>()(
        context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr,
        data_format_);

    if (cast_to_float) {
      functor::CastFunctor<Device, Eigen::half, float> cast;
      const Tensor& casted_filter_backprop_const = casted_filter_backprop;
      cast(device, filter_backprop->template flat<Eigen::half>(),
           casted_filter_backprop_const.template flat<float>());
    }
  }

 protected:
  bool use_cudnn_grouped_conv_;

 private:
  std::vector<int32> strides_;
  Padding padding_;
  std::vector<int64> explicit_paddings_;
  TensorFormat data_format_;
  int64 stride_;

  // For in_depth == 1 and grouped convolutions.
  LaunchConv2DBackpropFilterOp<Device, T> launcher_;
  bool cudnn_use_autotune_;
  DataType dtype_;

  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
};

#define REGISTER_CPU_KERNEL(T) \
  REGISTER_KERNEL_BUILDER( \
      Name("DepthwiseConv2dNativeBackpropFilter") \
          .Device(DEVICE_CPU) \
          .TypeConstraint<T>("T"), \
      DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
#endif
#undef REGISTER_CPU_KERNEL

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER_GPU_KERNEL(T) \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
                              .Device(DEVICE_GPU) \
                              .TypeConstraint<T>("T") \
                              .HostMemory("filter_sizes"), \
                          DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>)

TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL

#if CUDNN_VERSION >= 7000
template <typename T>
class DepthwiseConv2dGroupedConvBackpropFilterOp
    : public DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T> {
 public:
  DepthwiseConv2dGroupedConvBackpropFilterOp(OpKernelConstruction* context)
      : DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>(context) {
    this->use_cudnn_grouped_conv_ = true;
  }
};

#define REGISTER_GROUPED_CONV_KERNEL(T) \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
                              .Device(DEVICE_GPU) \
                              .TypeConstraint<T>("T") \
                              .HostMemory("filter_sizes") \
                              .Label("cudnn_grouped_convolution"), \
                          DepthwiseConv2dGroupedConvBackpropFilterOp<T>)

TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
#undef REGISTER_GROUPED_CONV_KERNEL
#endif  // CUDNN_VERSION
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow