1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #define EIGEN_USE_THREADS
17
18 #include <algorithm>
19 #include <cmath>
20
21 #include "tensorflow/core/framework/bounds_check.h"
22 #include "tensorflow/core/framework/kernel_shape_util.h"
23 #include "tensorflow/core/framework/numeric_op.h"
24 #include "tensorflow/core/framework/op_kernel.h"
25 #include "tensorflow/core/framework/register_types.h"
26 #include "tensorflow/core/framework/tensor.h"
27 #include "tensorflow/core/framework/tensor_shape.h"
28 #include "tensorflow/core/framework/tensor_types.h"
29 #include "tensorflow/core/framework/types.h"
30 #include "tensorflow/core/kernels/cast_op.h"
31 #include "tensorflow/core/kernels/conv_grad_ops.h"
32 #include "tensorflow/core/kernels/depthwise_conv_op.h"
33 #include "tensorflow/core/lib/core/status.h"
34 #include "tensorflow/core/platform/logging.h"
35 #include "tensorflow/core/platform/types.h"
36 #include "tensorflow/core/util/padding.h"
37 #include "tensorflow/core/util/tensor_format.h"
38 #include "tensorflow/core/util/use_cudnn.h"
39 #include "tensorflow/core/util/work_sharder.h"
40
41 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
42
43 #if GOOGLE_CUDA
44 #include "third_party/gpus/cudnn/cudnn.h"
45 #endif
46
47 #include "tensorflow/core/platform/stream_executor.h"
48 #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
49
50 namespace tensorflow {
51
52 // Gradient operations for depthwise convolution.
53
54 typedef Eigen::ThreadPoolDevice CPUDevice;
55 typedef Eigen::GpuDevice GPUDevice;
56
57 // Common code between the two backward pass kernels: verifies that the
58 // dimensions all match and extracts the padded rows and columns.
59 #define EXTRACT_AND_VERIFY_DIMENSIONS(label) \
60 const Tensor& out_backprop = context->input(2); \
61 OP_REQUIRES( \
62 context, input_shape.dims() == 4, \
63 errors::InvalidArgument(label, ": input must be 4-dimensional")); \
64 OP_REQUIRES( \
65 context, filter_shape.dims() == 4, \
66 errors::InvalidArgument(label, ": filter must be 4-dimensional")); \
67 OP_REQUIRES( \
68 context, out_backprop.dims() == 4, \
69 errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
70 const int64 batch = input_shape.dim_size(0); \
71 OP_REQUIRES( \
72 context, batch == out_backprop.dim_size(0), \
73 errors::InvalidArgument( \
74 label, ": input and out_backprop must have the same batch size")); \
75 const int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H'); \
76 OP_REQUIRES( \
77 context, \
78 FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()), \
79 errors::InvalidArgument("Input rows too large")); \
80 const int32 input_rows = static_cast<int32>(input_rows_raw); \
81 const int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W'); \
82 OP_REQUIRES( \
83 context, \
84 FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()), \
85 errors::InvalidArgument("Input cols too large")); \
86 const int32 input_cols = static_cast<int32>(input_cols_raw); \
87 const int64 filter_rows = filter_shape.dim_size(0); \
88 const int64 filter_cols = filter_shape.dim_size(1); \
89 const int64 output_rows_raw = \
90 GetTensorDim(out_backprop.shape(), data_format_, 'H'); \
91 OP_REQUIRES( \
92 context, \
93 FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()), \
94 errors::InvalidArgument("Output rows too large")); \
95 const int32 output_rows = static_cast<int32>(output_rows_raw); \
96 const int64 output_cols_raw = \
97 GetTensorDim(out_backprop.shape(), data_format_, 'W'); \
98 OP_REQUIRES( \
99 context, \
100 FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()), \
101 errors::InvalidArgument("Output cols too large")); \
102 const int32 output_cols = static_cast<int32>(output_cols_raw); \
103 const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C'); \
104 OP_REQUIRES(context, in_depth == filter_shape.dim_size(2), \
105 errors::InvalidArgument( \
106 label, ": input and filter must have the same in_depth")); \
107 const int64 depth_multiplier = filter_shape.dim_size(3); \
108 const int64 out_depth_raw = \
109 GetTensorDim(out_backprop.shape(), data_format_, 'C'); \
110 OP_REQUIRES( \
111 context, \
112 FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()), \
113 errors::InvalidArgument("Output depth too large")); \
114 const int32 out_depth = static_cast<int32>(out_depth_raw); \
115 OP_REQUIRES( \
116 context, (depth_multiplier * in_depth) == out_depth, \
117 errors::InvalidArgument( \
118 label, ": depth_multiplier * in_depth not equal to out_depth")); \
119 const auto stride = stride_; \
120 int64 out_rows = 0, out_cols = 0, pad_top = 0, pad_bottom = 0, pad_left = 0, \
121 pad_right = 0; \
122 if (padding_ == Padding::EXPLICIT) { \
123 GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'H', &pad_top, \
124 &pad_bottom); \
125 GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'W', &pad_left, \
126 &pad_right); \
127 } \
128 OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( \
129 input_rows, filter_rows, stride_, padding_, \
130 &out_rows, &pad_top, &pad_bottom)); \
131 OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( \
132 input_cols, filter_cols, stride_, padding_, \
133 &out_cols, &pad_left, &pad_right)); \
134 OP_REQUIRES( \
135 context, output_rows == out_rows, \
136 errors::InvalidArgument( \
137 label, ": Number of rows of out_backprop doesn't match computed: ", \
138 "actual = ", output_rows, ", computed = ", out_rows)); \
139 OP_REQUIRES( \
140 context, output_cols == out_cols, \
141 errors::InvalidArgument( \
142 label, ": Number of cols of out_backprop doesn't match computed: ", \
143 "actual = ", output_cols, ", computed = ", out_cols)); \
144 DepthwiseArgs args; \
145 args.batch = batch; \
146 args.in_rows = input_rows; \
147 args.in_cols = input_cols; \
148 args.in_depth = in_depth; \
149 args.filter_rows = filter_rows; \
150 args.filter_cols = filter_cols; \
151 args.depth_multiplier = depth_multiplier; \
152 args.stride = stride; \
153 args.pad_rows = pad_top; \
154 args.pad_cols = pad_left; \
155 args.out_rows = out_rows; \
156 args.out_cols = out_cols; \
157 args.out_depth = out_depth; \
158 VLOG(2) << "DepthwiseConv2d: " << label << " Input: [" << batch << ", " \
159 << input_rows << ", " << input_cols << ", " << in_depth \
160 << "]; Filter: [" << filter_rows << ", " << filter_cols << ", " \
161 << in_depth << ", " << depth_multiplier << "]; stride = " << stride \
162 << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left \
163 << ", output: [" << batch << ", " << out_rows << ", " << out_cols \
164 << ", " << out_depth << "]";
165
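// Worked example of the checks above (illustrative shapes, not taken from
// this file): for an NHWC input of [2, 32, 32, 8], a filter of [3, 3, 8, 2],
// stride 1 and SAME padding, GetWindowedOutputSizeVerbose yields
// out_rows = out_cols = 32 with pad_top = pad_left = 1, and out_backprop is
// required to be [2, 32, 32, 16] since out_depth = in_depth * depth_multiplier.
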
166 // Copies data from local region in 'out_backprop' into 'buffer'.
167 // The local region coordinates are calculated as the set of output points which
168 // used the input point ('in_r', 'in_c') as input during the forward pass.
169 // Rather than spatially reversing the filter, the input is reversed during
170 // the copy. The copied data is padded to vector register-width boundaries so
171 // that it is aligned for efficient traversal and vector multiply-add by the
172 // depthwise input kernel.
173 //
174 // EX:
175 // in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
176 //
177 // 'out_backprop': [batch, out_rows, out_cols, out_depth]
178 //
179 // [a00, a01, a10, a11] [a20, a21, b00, b01]
180 // [b10, b11, b20, b21] [...]
181 // [e00, e01, e10, e11] [e20, e21, f00, f01]
182 // [f10, f11, f20, f21] [...]
183 //
184 // 'buffer' (register boundaries shown):
185 //
186 // [f00, f01, f10, f11] [f20, f21, 0, 0] in_row = 0, in_col = 0
187 // [e00, e01, e10, e11] [e20, e21, 0, 0] in_row = 0, in_col = 1
188 // [b00, b01, b10, b11] [b20, b21, 0, 0] in_row = 1, in_col = 0
189 // [a00, a01, a10, a11] [a20, a21, 0, 0] in_row = 1, in_col = 1
190 //
191 template <typename T>
192 static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
193 const int64 padded_filter_inner_dim_size,
194 const int64 in_r, const int64 in_c,
195 const T* out_backprop, T* buffer) {
196 typedef typename Eigen::internal::packet_traits<T>::type Packet;
197 static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
198
199 const int64 stride = args.stride;
200 const int64 filter_rows = args.filter_rows;
201 const int64 filter_cols = args.filter_cols;
202 const int64 pad_rows = args.pad_rows;
203 const int64 pad_cols = args.pad_cols;
204 const int64 out_rows = args.out_rows;
205 const int64 out_cols = args.out_cols;
206
207 // Calculate the output spatial region which used point (in_r, in_c) as input.
208 const int64 out_r_start = std::max(
209 static_cast<int64>(0), (in_r - filter_rows + pad_rows + stride) / stride);
210 const int64 out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
211 const int64 out_c_start = std::max(
212 static_cast<int64>(0), (in_c - filter_cols + pad_cols + stride) / stride);
213 const int64 out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);
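// For illustration (hypothetical sizes): with stride = 2, filter_rows = 3,
// pad_rows = 1 and in_r = 4, out_r_start = (4 - 3 + 1 + 2) / 2 = 2 and
// out_r_end = (4 + 1) / 2 = 2, i.e. only output row 2 (whose forward-pass
// window covered input rows [3, 5]) contributes to input row 4.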
214
215 // Zero-pad 'buffer' if output region is smaller than filter spatial size.
216 const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
217 if ((out_r_end - out_r_start + 1) < args.filter_rows ||
218 (out_c_end - out_c_start + 1) < args.filter_cols) {
219 memset(buffer, 0,
220 filter_spatial_size * padded_filter_inner_dim_size * sizeof(T));
221 }
222
223 // Calculate vectorized and scalar (residual) lengths for 'out_depth'.
224 const int64 vectorized_size = (args.out_depth / kPacketSize) * kPacketSize;
225 const int64 scalar_size = args.out_depth % kPacketSize;
226 const int64 pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
227
228 for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
229 const int64 f_r = in_r + pad_rows - out_r * stride;
230 for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
231 const int64 f_c = in_c + pad_cols - out_c * stride;
232 const int64 buf_base =
233 (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
234 // Calculate index into 'out_backprop' for coordinate (out_r, out_c).
235 auto* out_bprop =
236 out_backprop + (out_r * args.out_cols + out_c) * args.out_depth;
237
238 // Copy vectorized portion of inner dimension into 'buffer'.
239 for (int64 d = 0; d < vectorized_size; d += kPacketSize) {
240 auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
241 Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
242 }
243 // Copy scalar portion of out_bprop to 'buffer'
244 for (int64 d = 0; d < scalar_size; ++d) {
245 buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
246 }
247 // Pad to vector-register width (if needed).
248 for (int64 d = 0; d < pad_size; ++d) {
249 buffer[buf_base + vectorized_size + scalar_size + d] =
250 static_cast<T>(0);
251 }
252 }
253 }
254 }
255
256 // Computes the vectorized product of 'buffer' and 'filter' and stores
257 // result in 'output' at location computed from 'in_r' and 'in_c'.
258 // If depth_multiplier is > 1, the intermediate output is reduced along
259 // the depth_multiplier dimension.
260 //
261 // EX:
262 // in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
263 // Both 'input_buffer' and 'filter' are padded to register-width boundaries.
264 //
265 // 'buffer' [rows, cols, in_depth, depth_multiplier]
266 //
267 // [f00, f01, f10, f11] [f20, f21, 0, 0] in_row = 0, in_col = 0
268 // [e00, e01, e10, e11] [e20, e21, 0, 0] in_row = 0, in_col = 1
269 // [b00, b01, b10, b11] [b20, b21, 0, 0] in_row = 1, in_col = 0
270 // [a00, a01, a10, a11] [a20, a21, 0, 0] in_row = 1, in_col = 1
271 //
272 // filter [rows, cols, in_depth, depth_multiplier]
273 // [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
274 // [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
275 //
276 // First output register [in_depth, depth_multiplier]
277 // [q00, q01, q10, q11] = ([f00, f01, f10, f11] x [u0, v0, w0, x0]) +
278 // ([e00, e01, e10, e11] x [u1, v1, w1, x1]) +
279 // ([b00, b01, b10, b11] x [u2, v2, w2, x2]) +
280 // ([a00, a01, a10, a11] x [u3, v3, w3, x3])
281 //
282 // Reduction step along depth-multiplier dimension:
283 //
284 // [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0]
285 //
286
287 template <typename T>
288 static void ComputeBackpropInput(const DepthwiseArgs& args,
289 const int64 padded_filter_inner_dim_size,
290 const int64 in_r, const int64 in_c,
291 const T* filter, const T* buffer,
292 T* out_buffer, T* output) {
293 typedef typename Eigen::internal::packet_traits<T>::type Packet;
294 static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
295
296 const int64 in_depth = args.in_depth;
297 const int64 depth_multiplier = args.depth_multiplier;
298 const int64 out_depth = args.out_depth;
299 const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
300
301 // Calculate vectorized and scalar lengths of 'out_depth'.
302 const int64 output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
303 const int64 output_scalar_size = out_depth % kPacketSize;
304
305 // Calculate base index at which to begin writing output.
306 const int64 base_output_index = (in_r * args.in_cols + in_c) * in_depth;
307
308 // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is
309 // used to efficiently reduce output when 'depth_multiplier' > kPacketSize.
310 const int64 dm_vectorized_size =
311 (depth_multiplier / kPacketSize) * kPacketSize;
312 const int64 dm_scalar_size = depth_multiplier % kPacketSize;
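// For illustration (assuming kPacketSize == 4): out_depth = 6 gives
// output_vectorized_size = 4 and output_scalar_size = 2, and
// depth_multiplier = 6 gives dm_vectorized_size = 4 and dm_scalar_size = 2,
// so the reduction at the bottom handles one full packet plus a two-element
// scalar tail per input channel.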
313
314 for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
315 // Reset accumulator.
316 auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
317 for (int j = 0; j < filter_spatial_size; ++j) {
318 // Calculate index.
319 const int64 index = i + j * padded_filter_inner_dim_size;
320 // Load filter.
321 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
322 // Load input.
323 const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
324 // Vector multiply-add.
325 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
326 }
327 if (depth_multiplier == 1) {
328 // Write directly to the output.
329 Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
330 } else {
331 // Buffer output for subsequent reduction step.
332 Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
333 }
334 }
335
336 if (output_scalar_size > 0) {
337 auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
338 for (int j = 0; j < filter_spatial_size; ++j) {
339 const int64 index =
340 output_vectorized_size + j * padded_filter_inner_dim_size;
341 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
342 const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
343 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
344 }
345 // Load accumulator into an array and loop through output.
346 T out_buf[kPacketSize];
347 Eigen::internal::pstoreu<T>(out_buf, vaccum);
348 if (depth_multiplier == 1) {
349 // Write directly to the output.
350 for (int j = 0; j < output_scalar_size; ++j) {
351 output[base_output_index + output_vectorized_size + j] = out_buf[j];
352 }
353 } else {
354 // Buffer output for subsequent reduction step.
355 for (int j = 0; j < output_scalar_size; ++j) {
356 out_buffer[output_vectorized_size + j] = out_buf[j];
357 }
358 }
359 }
360
361 // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'.
362 if (depth_multiplier > 1) {
363 for (int64 d = 0; d < in_depth; ++d) {
364 const int64 index = d * args.depth_multiplier;
365 T accum = static_cast<T>(0);
366 for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
367 const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
368 accum += Eigen::internal::predux(v);
369 }
370 // Accumulate the scalar portion of the replicated output.
371 for (int64 dm = 0; dm < dm_scalar_size; ++dm) {
372 accum += out_buffer[index + dm_vectorized_size + dm];
373 }
374 // Copy to output.
375 output[base_output_index + d] = accum;
376 }
377 }
378 }
379
380 // Computes the depthwise conv2d backprop input of 'out_backprop' by
381 // 'depthwise_filter' and stores the result in 'in_backprop'.
382 template <typename T>
383 struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
384 typedef typename Eigen::internal::packet_traits<T>::type Packet;
385
386 void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
387 const T* out_backprop, const T* depthwise_filter,
388 T* in_backprop, TensorFormat data_format) {
389 OP_REQUIRES(
390 ctx, data_format == FORMAT_NHWC,
391 errors::Unimplemented(
392 "Depthwise convolution on CPU is only supported for NHWC format"));
393
394 static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
395
396 // Pad 'depthwise_filter' to vector register width (if needed).
397 const bool pad_filter = (args.out_depth % kPacketSize) != 0;
398 Tensor padded_filter;
399 if (pad_filter) {
400 // Allocate space for padded filter.
401 const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
402 const int64 padded_filter_inner_dim_size =
403 ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
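// E.g. (illustrative numbers) out_depth = 6 with kPacketSize = 4 rounds
// the padded inner dimension up to 8.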
404 OP_REQUIRES_OK(
405 ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
406 TensorShape({filter_spatial_size,
407 padded_filter_inner_dim_size}),
408 &padded_filter));
409 // Write out padded filter.
410 functor::DepthwiseFilterPadOp<T>()(
411 args, depthwise_filter, padded_filter.template flat<T>().data());
412 }
413 const T* filter_data =
414 pad_filter ? padded_filter.template flat<T>().data() : depthwise_filter;
415
416 // Computes one shard of depthwise conv2d backprop input.
417 auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
418 int64 start, int64 limit) {
419 static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
420
421 const int64 input_image_size =
422 args.in_rows * args.in_cols * args.in_depth;
423 const int64 output_image_size =
424 args.out_rows * args.out_cols * args.out_depth;
425 const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
426 const int64 padded_filter_inner_dim_size =
427 ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
428
429 // Allocate buffer to copy regions from 'out_backprop'.
430 Tensor out_bprop_buffer;
431 OP_REQUIRES_OK(
432 ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
433 TensorShape({filter_spatial_size,
434 padded_filter_inner_dim_size}),
435 &out_bprop_buffer));
436 T* out_bprop_buf = out_bprop_buffer.template flat<T>().data();
437
438 // Allocate buffer for intermediate results.
439 Tensor in_bprop_buffer;
440 OP_REQUIRES_OK(
441 ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
442 TensorShape({padded_filter_inner_dim_size}),
443 &in_bprop_buffer));
444 T* in_bprop_buf = in_bprop_buffer.template flat<T>().data();
445
446 for (int64 b = start; b < limit; ++b) {
447 for (int64 in_r = 0; in_r < args.in_rows; ++in_r) {
448 for (int64 in_c = 0; in_c < args.in_cols; ++in_c) {
449 // Populate 'out_bprop_buf' from local 'out_backprop' region.
450 CopyOutputBackpropRegion<T>(
451 args, padded_filter_inner_dim_size, in_r, in_c,
452 out_backprop + b * output_image_size, out_bprop_buf);
453
454 // Compute depthwise backprop input.
455 ComputeBackpropInput<T>(args, padded_filter_inner_dim_size, in_r,
456 in_c, filter_data, out_bprop_buf,
457 in_bprop_buf,
458 in_backprop + b * input_image_size);
459 }
460 }
461 }
462 };
463
464 const int64 shard_cost = args.in_rows * args.in_cols * args.out_depth;
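// Shard splits the batch range [0, args.batch) across the CPU worker
// threads, using 'shard_cost' as a rough per-image work estimate when
// deciding how finely to subdivide the batch.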
465 auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
466 Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
467 shard_cost, shard);
468 }
469 };
470
471 template <typename T>
472 static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
473 const T* out_backprop,
474 const T* filter,
475 T* in_backprop) {
476 // Naive for loop as a reference point without concerns about performance.
477 for (int b = 0; b < args.batch; ++b) {
478 for (int in_r = 0; in_r < args.in_rows; ++in_r) {
479 for (int in_c = 0; in_c < args.in_cols; ++in_c) {
480 for (int in_d = 0; in_d < args.in_depth; ++in_d) {
481 T sum = 0;
482 const int stride = args.stride;
483 const int out_d_start = in_d * args.depth_multiplier;
484 const int out_d_end = out_d_start + args.depth_multiplier;
485
486 for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
487 const int out_r_start = std::max(
488 0, (in_r - args.filter_rows + args.pad_rows + stride) / stride);
489 const int out_r_end =
490 std::min(args.out_rows - 1, (in_r + args.pad_rows) / stride);
491
492 for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
493 const int out_c_start = std::max(
494 0,
495 (in_c - args.filter_cols + args.pad_cols + stride) / stride);
496 const int out_c_end =
497 std::min(args.out_cols - 1, (in_c + args.pad_cols) / stride);
498
499 for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
500 int f_r = in_r + args.pad_rows - out_r * stride;
501 int f_c = in_c + args.pad_cols - out_c * stride;
502 int filter_dm = out_d - out_d_start;
503 int out_backprop_offset =
504 out_d +
505 args.out_depth *
506 (out_c + args.out_cols * (out_r + args.out_rows * b));
507 int filter_offset =
508 filter_dm +
509 args.depth_multiplier *
510 (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
511 sum +=
512 out_backprop[out_backprop_offset] * filter[filter_offset];
513 }
514 }
515 }
516
517 int in_backprop_offset =
518 in_d +
519 args.in_depth * (in_c + args.in_cols * (in_r + args.in_rows * b));
520 in_backprop[in_backprop_offset] = sum;
521 }
522 }
523 }
524 }
525 }
526
527 // Extern template instantiated in conv_grad_input_ops.cc.
528 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>;
529 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, float>;
530 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, double>;
531
532 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
533
534 // Extern template instantiated in conv_grad_input_ops.cc.
535 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
536 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
537 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;
538
539 // Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
540 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
541 Eigen::half>;
542 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
543 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
544
545 #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
546
547 // Kernel to compute the input backprop for depthwise convolution.
548 template <typename Device, class T>
549 class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
550 public:
551 explicit DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction* context)
552 : OpKernel(context) {
553 OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
554 OP_REQUIRES(context, strides_.size() == 4,
555 errors::InvalidArgument("Sliding window strides field must "
556 "specify 4 dimensions"));
557
558 string data_format;
559 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
560 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
561 errors::InvalidArgument("Invalid data format"));
562
563 stride_ = GetTensorDim(strides_, data_format_, 'H');
564 const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
565 const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
566 const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
567
568 OP_REQUIRES(context, stride_ == stride_w,
569 errors::InvalidArgument(
570 "Current implementation only supports equal length "
571 "strides in the row and column dimensions."));
572 OP_REQUIRES(
573 context, (stride_n == 1 && stride_c == 1),
574 errors::InvalidArgument("Current implementation does not yet support "
575 "strides in the batch and depth dimensions."));
576 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
577 OP_REQUIRES_OK(context,
578 context->GetAttr("explicit_paddings", &explicit_paddings_));
579 OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
580 /*num_dims=*/4, data_format_));
581
582 cudnn_use_autotune_ = CudnnUseAutotune();
583 dtype_ = DataTypeToEnum<T>::value;
584 #if CUDNN_VERSION >= 8000
585 // From the cuDNN release note 8.0: We’ve extended the fprop and dgrad
586 // NHWC depthwise kernels to support more combinations (filter
587 // sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
588 // to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
589 // good performance. (https://docs.nvidia.com/deeplearning/sdk/cudnn-
590 // release-notes/rel_8.html#rel_8)
591 use_cudnn_grouped_conv_ =
592 dtype_ == DT_HALF &&
593 ((data_format_ == FORMAT_NCHW && stride_ == 1 && stride_w == 1) ||
594 (data_format_ == FORMAT_NHWC && stride_ == stride_w &&
595 (stride_ == 1 || stride_ == 2)));
596 #elif CUDNN_VERSION >= 7603
597 // Use CuDNN grouped conv (input gradient) when stride = 1, input/output is
598 // NCHW and float16(half). See cudnn release note 7.6.3 (https://docs.nvidi
599 // a.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763).
600 use_cudnn_grouped_conv_ = dtype_ == DT_HALF &&
601 data_format_ == FORMAT_NCHW && stride_ == 1 &&
602 stride_w == 1;
603 #else
604 use_cudnn_grouped_conv_ = false;
605 #endif
606 }
607
608 void Compute(OpKernelContext* context) override {
609 const Tensor& input_sizes = context->input(0);
610 const Tensor& filter = context->input(1);
611 OP_REQUIRES(
612 context, TensorShapeUtils::IsVector(input_sizes.shape()),
613 errors::InvalidArgument(
614 "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
615 input_sizes.dims()));
616 TensorShape input_shape;
617 const int32* in_sizes_data = input_sizes.template flat<int32>().data();
618
619 for (int i = 0; i < input_sizes.NumElements(); ++i) {
620 OP_REQUIRES(context, in_sizes_data[i] >= 0,
621 errors::InvalidArgument("Dimension ", i,
622 " of input_sizes must be >= 0"));
623 input_shape.AddDim(in_sizes_data[i]);
624 }
625 const TensorShape& filter_shape = filter.shape();
626 EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
627
628 Tensor* in_backprop = nullptr;
629 OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
630 {0}, 0, input_shape, &in_backprop));
631
632 // If there is nothing to compute, return.
633 if (input_shape.num_elements() == 0) {
634 return;
635 }
636
637 // If in_depth==1, this operation is just a standard convolution.
638 // Depthwise convolution is a special case of cuDNN's grouped convolution.
639 bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
640 (in_depth == 1 ||
641 (use_cudnn_grouped_conv_ &&
642 IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
643 /*filter_cols=*/filter_cols,
644 /*in_depth=*/in_depth,
645 /*out_depth=*/out_depth)));
646
647 VLOG(2) << "DepthwiseConv2dNativeBackpropInput: "
648 << " Input: [" << batch << ", " << input_rows << ", " << input_cols
649 << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
650 << filter_cols << ", " << in_depth << ", " << depth_multiplier
651 << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
652 << ", " << out_depth << "], stride = " << stride_
653 << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left
654 << ", Use cuDNN: " << use_cudnn;
655
656 if (use_cudnn) {
657 // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
658 //
659 // | TensorFlow | cuDNN
660 // --------------------------------------------------------------------
661 // filter_out_depth | depth_multiplier | depth_multiplier * group_count
662 // filter_in_depth | in_depth | in_depth / group_count
663 //
664 // For depthwise convolution, we have group_count == in_depth.
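// E.g. (illustrative shapes) a TF depthwise filter of [3, 3, 8, 2]
// (in_depth = 8, depth_multiplier = 2) is presented to cuDNN as a grouped
// filter of [3, 3, 1, 16] with group_count = 8; CopyFrom below reuses the
// existing buffer and only reinterprets its shape.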
665 int32 filter_in_depth = 1;
666 TensorShape shape =
667 TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
668 Tensor reshaped_filter(/*type=*/dtype_);
669 OP_REQUIRES(
670 context, reshaped_filter.CopyFrom(filter, shape),
671 errors::Internal(
672 "Failed to reshape filter tensor for grouped convolution."));
673 // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
674 // conv is supported.
675 launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
676 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
677 stride_, stride_, padding_, explicit_paddings_, in_backprop,
678 data_format_);
679 return;
680 }
681
682 auto out_backprop_ptr = out_backprop.template flat<T>().data();
683 auto filter_ptr = filter.template flat<T>().data();
684 auto in_backprop_ptr = in_backprop->template flat<T>().data();
685 LaunchDepthwiseConvBackpropInputOp<Device, T>()(
686 context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
687 data_format_);
688 }
689
690 protected:
691 bool use_cudnn_grouped_conv_;
692
693 private:
694 std::vector<int32> strides_;
695 Padding padding_;
696 std::vector<int64> explicit_paddings_;
697 TensorFormat data_format_;
698 int64 stride_;
699
700 // For in_depth == 1 and grouped convolutions.
701 LaunchConv2DBackpropInputOp<Device, T> launcher_;
702 bool cudnn_use_autotune_;
703 DataType dtype_;
704
705 TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
706 };
707
708 #define REGISTER_CPU_KERNEL(T) \
709 REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
710 .Device(DEVICE_CPU) \
711 .TypeConstraint<T>("T"), \
712 DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);
713
714 TF_CALL_half(REGISTER_CPU_KERNEL);
715 TF_CALL_float(REGISTER_CPU_KERNEL);
716 #if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
717 TF_CALL_double(REGISTER_CPU_KERNEL);
718 #endif
719 #undef REGISTER_CPU_KERNEL
720
721 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
722
723 #define REGISTER_GPU_KERNEL(T) \
724 REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
725 .Device(DEVICE_GPU) \
726 .TypeConstraint<T>("T") \
727 .HostMemory("input_sizes"), \
728 DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>)
729
730 TF_CALL_half(REGISTER_GPU_KERNEL);
731 TF_CALL_float(REGISTER_GPU_KERNEL);
732 TF_CALL_double(REGISTER_GPU_KERNEL);
733 #undef REGISTER_GPU_KERNEL
734
735 #if CUDNN_VERSION >= 7000
736 template <typename T>
737 class DepthwiseConv2dGroupedConvBackpropInputOp
738 : public DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T> {
739 public:
740 DepthwiseConv2dGroupedConvBackpropInputOp(OpKernelConstruction* context)
741 : DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>(context) {
742 this->use_cudnn_grouped_conv_ = true;
743 }
744 };
745
746 #define REGISTER_GROUPED_CONV_KERNEL(T) \
747 REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
748 .Device(DEVICE_GPU) \
749 .TypeConstraint<T>("T") \
750 .HostMemory("input_sizes") \
751 .Label("cudnn_grouped_convolution"), \
752 DepthwiseConv2dGroupedConvBackpropInputOp<T>)
753
754 TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
755 TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
756 TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
757 #undef REGISTER_GROUPED_CONV_KERNEL
758 #endif // CUDNN_VERSION
759 #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
760
761 // Kernels to compute the gradients of the filters for depthwise convolution.
762
763 // Computes filter backprop using 'out_backprop' and 'input_buffer', storing the
764 // result in 'output_buffer' at an index computed from 'out_r' and 'out_c'.
765 //
766 // EX:
767 // in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
768 // Both 'input_buffer' and 'filter' are padded to register-width boundaries.
769 //
770 // 'input_buffer' [rows, cols, in_depth, depth_multiplier]
771 //
772 // [f00, f01, f10, f11] [f20, f21, 0, 0] in_row = 0, in_col = 0
773 // [e00, e01, e10, e11] [e20, e21, 0, 0] in_row = 0, in_col = 1
774 // [b00, b01, b10, b11] [b20, b21, 0, 0] in_row = 1, in_col = 0
775 // [a00, a01, a10, a11] [a20, a21, 0, 0] in_row = 1, in_col = 1
776 //
777 // 'out_backprop' [out_rows, out_cols, in_depth, depth_multiplier]
778 //
779 // [q00, q01, q10, q11] [q20, q21, r00, r01]
780 // [r10, r11, r20, r21] [s00, s01, s10, s11]
781 // [s20, s21, t00, t01] [t10, t11, t20, t21]
782 //
783 // First output register of 'filter_backprop'
784 // [u0, v0, w0, x0] += ([f00, f01, f10, f11] x [q00, q01, q10, q11])
785 //
786 template <typename T>
787 static void ComputeBackpropFilter(const DepthwiseArgs& args,
788 const int64 padded_out_depth_size,
789 const int64 out_r, const int64 out_c,
790 const T* out_backprop, const T* input_buffer,
791 T* output_buffer) {
792 typedef typename Eigen::internal::packet_traits<T>::type Packet;
793 static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
794 // Calculate vectorized size of 'padded_out_depth_size'.
795 const int64 out_depth = args.out_depth;
796 const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
797 const int64 output_vectorized_size =
798 (padded_out_depth_size / kPacketSize) * kPacketSize;
799 const int64 base_output_index = (out_r * args.out_cols + out_c) * out_depth;
800 // Determine whether we can execute fast or slow code path.
801 const int64 output_image_size =
802 args.out_rows * args.out_cols * args.out_depth;
803 const int64 output_last_vector_index =
804 output_image_size - (filter_spatial_size * padded_out_depth_size);
805 const bool fast_path = base_output_index <= output_last_vector_index;
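// E.g. (illustrative numbers) with a 128 x 128 x 8 'out_backprop' image
// (131072 elements), filter_spatial_size = 9 and padded_out_depth_size = 8,
// the threshold is 131072 - 72 = 131000, so only the last few output
// positions fall back to the slow path, which stages the tail of
// 'out_backprop' in a zero-padded stack buffer before the vector load.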
806
807 if (fast_path) {
808 // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can
809 // amortize the cost of 'output_buffer' load store in the loop below.
810 for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
811 // Load vector register from 'out_backprop'.
812 const auto out_bprop_block =
813 Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
814 for (int j = 0; j < filter_spatial_size; ++j) {
815 const int64 index = i + j * padded_out_depth_size;
816 // Load vector register from 'input_buffer'.
817 const auto input_block =
818 Eigen::internal::ploadu<Packet>(input_buffer + index);
819 // Load output block into vector register.
820 auto out_block_data = output_buffer + index;
821 auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
822 // Vector multiply-add.
823 out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
824 out_block);
825 // Store 'out_block' back to memory.
826 Eigen::internal::pstoreu<T>(out_block_data, out_block);
827 }
828 }
829 } else {
830 // Slow path (can't do vector reads from non-padded 'out_backprop').
831 for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
832 // Calculate safe read size from 'out_backprop'.
833 const int64 out_bprop_index = base_output_index + i;
834 const int64 out_bprop_limit =
835 std::min(output_image_size, out_bprop_index + kPacketSize);
836 T out_buf[kPacketSize];
837 memset(&out_buf, 0, kPacketSize * sizeof(T));
838 const int64 scalar_size = out_bprop_limit - out_bprop_index;
839 for (int64 j = 0; j < scalar_size; ++j) {
840 out_buf[j] = out_backprop[out_bprop_index + j];
841 }
842 // Load vector register from 'out_buf'.
843 const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
844 for (int j = 0; j < filter_spatial_size; ++j) {
845 const int64 index = i + j * padded_out_depth_size;
846 // Load vector register from 'input_buffer'.
847 const auto input_block =
848 Eigen::internal::ploadu<Packet>(input_buffer + index);
849 // Load output block into vector register.
850 auto out_block_data = output_buffer + index;
851 auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
852 // Vector multiply-add.
853 out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
854 out_block);
855 // Store 'out_block' back to memory.
856 Eigen::internal::pstoreu<T>(out_block_data, out_block);
857 }
858 }
859 }
860 }
861
862 template <typename Device, typename T>
863 struct LaunchDepthwiseConvBackpropFilterOp;
864
865 template <typename T>
866 struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
867 typedef typename Eigen::internal::packet_traits<T>::type Packet;
868
869 void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
870 const T* out_backprop, const T* input, T* filter_backprop,
871 TensorFormat data_format) {
872 OP_REQUIRES(
873 ctx, data_format == FORMAT_NHWC,
874 errors::Unimplemented(
875 "Depthwise convolution on CPU is only supported for NHWC format"));
876
877 static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
878
879 const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
880 const int64 padded_out_depth_size =
881 ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
882
883 // Allocate output buffers for each image in 'batch' (padded to vector
884 // register boundaries).
885 Tensor output_buffer;
886 OP_REQUIRES_OK(
887 ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
888 TensorShape({args.batch, filter_spatial_size,
889 padded_out_depth_size}),
890 &output_buffer));
891 T* output_buffer_data = output_buffer.template flat<T>().data();
892
893 // Computes one shard of depthwise conv2d backprop filter.
894 auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
895 int64 start, int64 limit) {
896 static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
897 const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
898 const int64 padded_out_depth_size =
899 ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
900
901 // Allocate buffer for local input regions.
902 Tensor input_buffer;
903 OP_REQUIRES_OK(
904 ctx, ctx->allocate_temp(
905 DataTypeToEnum<T>::value,
906 TensorShape({filter_spatial_size, padded_out_depth_size}),
907 &input_buffer));
908 T* input_buffer_data = input_buffer.template flat<T>().data();
909
910 const int64 input_image_size =
911 args.in_rows * args.in_cols * args.in_depth;
912 const int64 output_image_size =
913 args.out_rows * args.out_cols * args.out_depth;
914 const int64 padded_filter_size =
915 filter_spatial_size * padded_out_depth_size;
916
917 for (int b = start; b < limit; ++b) {
918 // Initialize 'output_buffer' for 'b'.
919 auto* output_buffer = output_buffer_data + b * padded_filter_size;
920 memset(output_buffer, 0, padded_filter_size * sizeof(T));
921
922 for (int out_r = 0; out_r < args.out_rows; ++out_r) {
923 for (int out_c = 0; out_c < args.out_cols; ++out_c) {
924 // Populate 'input_buffer_data' with data from local input region.
925 functor::DepthwiseInputCopyOp<T>()(
926 args, padded_out_depth_size, out_r, out_c,
927 input + b * input_image_size, input_buffer_data);
928 // Compute depthwise backprop filter.
929 ComputeBackpropFilter(args, padded_out_depth_size, out_r, out_c,
930 out_backprop + b * output_image_size,
931 input_buffer_data, output_buffer);
932 }
933 }
934 }
935 };
936 const int64 shard_cost = args.out_rows * args.out_cols * args.out_depth;
937 auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
938 Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
939 shard_cost, shard);
940
941 // Accumulate 'output_buffer' from each shard into 'output'.
942 const int64 out_depth = args.out_depth;
943 const int64 vectorized_size = (out_depth / kPacketSize) * kPacketSize;
944 const int64 scalar_size = out_depth - vectorized_size;
945 const int64 padded_filter_size =
946 filter_spatial_size * padded_out_depth_size;
947 memset(filter_backprop, 0, filter_spatial_size * out_depth * sizeof(T));
948
949 for (int64 i = 0; i < filter_spatial_size; ++i) {
950 const int64 buffer_base = i * padded_out_depth_size;
951 const int64 output_base = i * out_depth;
952 // Write vectorized length of filter's inner dimension to output.
953 for (int64 j = 0; j < vectorized_size; j += kPacketSize) {
954 // Load data from 'filter_backprop' into vector register.
955 auto out_block_data = filter_backprop + output_base + j;
956 auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
957 for (int b = 0; b < args.batch; ++b) {
958 // Load data from 'output_buffer' for 'b'.
959 const auto* output_buffer =
960 output_buffer_data + b * padded_filter_size;
961 const auto v =
962 Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j);
963 // Add 'v' to 'out_block'.
964 out_block = Eigen::internal::padd<Packet>(out_block, v);
965 }
966 // Store 'out_block' back to memory.
967 Eigen::internal::pstoreu<T>(out_block_data, out_block);
968 }
969 // Write scalar length of filter's inner dimension to output.
970 for (int64 j = 0; j < scalar_size; ++j) {
971 for (int b = 0; b < args.batch; ++b) {
972 const auto* output_buffer =
973 output_buffer_data + b * padded_filter_size;
974 filter_backprop[output_base + vectorized_size + j] +=
975 output_buffer[buffer_base + vectorized_size + j];
976 }
977 }
978 }
979 }
980 };
981
982 template <typename T>
983 static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
984 const T* out_backprop,
985 const T* input,
986 T* filter_backprop) {
987 int num_filter_backprop = args.filter_rows * args.filter_cols *
988 args.in_depth * args.depth_multiplier;
989 memset(filter_backprop, 0, num_filter_backprop * sizeof(T));
990 // Naive for loop as a reference point without concerns about performance.
991 for (int b = 0; b < args.batch; ++b) {
992 for (int out_r = 0; out_r < args.out_rows; ++out_r) {
993 for (int out_c = 0; out_c < args.out_cols; ++out_c) {
994 for (int out_d = 0; out_d < args.out_depth; ++out_d) {
995 const int in_d = out_d / args.depth_multiplier;
996 const int dm = out_d % args.depth_multiplier;
997 const int in_r_start = out_r * args.stride - args.pad_rows;
998 const int in_c_start = out_c * args.stride - args.pad_cols;
999
1000 for (int f_r = 0; f_r < args.filter_rows; ++f_r) {
1001 for (int f_c = 0; f_c < args.filter_cols; ++f_c) {
1002 const int in_r = in_r_start + f_r;
1003 const int in_c = in_c_start + f_c;
1004
1005 if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
1006 in_c < args.in_cols) {
1007 int out_backprop_offset =
1008 out_d +
1009 args.out_depth *
1010 (out_c + args.out_cols * (out_r + args.out_rows * b));
1011 int input_offset =
1012 in_d +
1013 args.in_depth *
1014 (in_c + args.in_cols * (in_r + args.in_rows * b));
1015 int filter_backprop_offset =
1016 dm +
1017 args.depth_multiplier *
1018 (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
1019 filter_backprop[filter_backprop_offset] +=
1020 input[input_offset] * out_backprop[out_backprop_offset];
1021 }
1022 }
1023 }
1024 }
1025 }
1026 }
1027 }
1028 }
1029
1030 // Extern template instantiated in conv_grad_filter_ops.cc.
1031 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, Eigen::half>;
1032 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, float>;
1033 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, double>;
1034
1035 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1036
1037 // Extern template instantiated in conv_grad_filter_ops.cc.
1038 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, Eigen::half>;
1039 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, float>;
1040 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, double>;
1041
1042 // Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
1043 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
1044 Eigen::half>;
1045 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
1046 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
1047
1048 #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1049
1050 // Kernel to compute the filter backprop for depthwise convolution.
1051 template <typename Device, class T>
1052 class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
1053 public:
1054 explicit DepthwiseConv2dNativeBackpropFilterOp(OpKernelConstruction* context)
1055 : OpKernel(context) {
1056 OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
1057 OP_REQUIRES(context, strides_.size() == 4,
1058 errors::InvalidArgument("Sliding window strides field must "
1059 "specify 4 dimensions"));
1060
1061 string data_format;
1062 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
1063 OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
1064 errors::InvalidArgument("Invalid data format"));
1065
1066 stride_ = GetTensorDim(strides_, data_format_, 'H');
1067 const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
1068 const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
1069 const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
1070
1071 OP_REQUIRES(context, stride_ == stride_w,
1072 errors::InvalidArgument(
1073 "Current implementation only supports equal length "
1074 "strides in the row and column dimensions."));
1075 OP_REQUIRES(
1076 context, (stride_n == 1 && stride_c == 1),
1077 errors::InvalidArgument("Current implementation does not yet support "
1078 "strides in the batch and depth dimensions."));
1079 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1080 OP_REQUIRES_OK(context,
1081 context->GetAttr("explicit_paddings", &explicit_paddings_));
1082 OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
1083 /*num_dims=*/4, data_format_));
1084
1085 cudnn_use_autotune_ = CudnnUseAutotune();
1086
1087 if (std::is_same<T, Eigen::half>::value) {
1088 dtype_ = DT_HALF;
1089 } else if (std::is_same<T, float>::value) {
1090 dtype_ = DT_FLOAT;
1091 } else if (std::is_same<T, double>::value) {
1092 dtype_ = DT_DOUBLE;
1093 } else {
1094 LOG(ERROR) << "Only half, float, and double are supported.";
1095 }
1096 // Use CuDNN grouped conv (filter gradients) when input/output is
1097 // float16(half). See cudnn release note 7.6.3. (https://docs.nvidia.com/dee
1098 // plearning/sdk/cudnn-release-notes/rel_763.html#rel_763)
1099 #if CUDNN_VERSION >= 7603
1100 use_cudnn_grouped_conv_ = dtype_ == DT_HALF;
1101 #else
1102 use_cudnn_grouped_conv_ = false;
1103 #endif
1104 }
1105
1106 void Compute(OpKernelContext* context) override {
1107 const Tensor& input = context->input(0);
1108 const Tensor& filter_sizes = context->input(1);
1109 OP_REQUIRES(
1110 context, TensorShapeUtils::IsVector(filter_sizes.shape()),
1111 errors::InvalidArgument(
1112 "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
1113 filter_sizes.dims()));
1114 TensorShape filter_shape;
1115 const int32* filter_sizes_data = filter_sizes.template flat<int32>().data();
1116 for (int i = 0; i < filter_sizes.NumElements(); ++i) {
1117 OP_REQUIRES(context, filter_sizes_data[i] >= 0,
1118 errors::InvalidArgument("Dimension ", i,
1119 " of filter_sizes must be >= 0"));
1120 filter_shape.AddDim(filter_sizes_data[i]);
1121 }
1122 const TensorShape& input_shape = input.shape();
1123
1124 EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropFilter");
1125 Tensor* filter_backprop = nullptr;
1126 OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
1127 {1}, 0, filter_shape, &filter_backprop));
1128
1129 // If there is nothing to compute, return.
1130 if (out_backprop.shape().num_elements() == 0) {
1131 return;
1132 }
1133
1134 // If in_depth==1, this operation is just a standard convolution.
1135 // Depthwise convolution is a special case of cuDNN's grouped convolution.
1136 bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
1137 (in_depth == 1 ||
1138 (use_cudnn_grouped_conv_ &&
1139 IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
1140 /*filter_cols=*/filter_cols,
1141 /*in_depth=*/in_depth,
1142 /*out_depth=*/out_depth)));
1143
1144 VLOG(2) << "DepthwiseConv2dNativeBackpropFilter: "
1145 << " Input: [" << batch << ", " << input_rows << ", " << input_cols
1146 << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
1147 << filter_cols << ", " << in_depth << ", " << depth_multiplier
1148 << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
1149 << ", " << out_depth << "], stride = " << stride_
1150 << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left
1151 << ", Use cuDNN: " << use_cudnn;
1152
1153 if (use_cudnn) {
1154 // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
1155 //
1156 // | TensorFlow | cuDNN
1157 // --------------------------------------------------------------------
1158 // filter_out_depth | depth_multiplier | depth_multiplier * group_count
1159 // filter_in_depth | in_depth | in_depth / group_count
1160 //
1161 // For depthwise convolution, we have group_count == in_depth.
1162 int32 filter_in_depth = 1;
1163 TensorShape shape =
1164 TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
1165 Tensor reshaped_filter(/*type=*/dtype_);
1166 OP_REQUIRES(
1167 context, reshaped_filter.CopyFrom(*filter_backprop, shape),
1168 errors::Internal(
1169 "Failed to reshape filter tensor for grouped convolution."));
1170
1171 // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
1172 // conv is supported.
1173 launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
1174 input,
1175 /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
1176 padding_, explicit_paddings_, &reshaped_filter, data_format_);
1177 return;
1178 }
1179
1180 // For GPU inputs with type half, we cast inputs to float and outputs back
1181 // to half, as the half implementation is slow and does not use full precision
1182 // accumulation in some cases.
1183 constexpr bool cast_to_float = std::is_same<T, Eigen::half>::value &&
1184 std::is_same<Device, GPUDevice>::value;
1185 using U = typename std::conditional<cast_to_float, float, T>::type;
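// When T is Eigen::half on a GPU device, U is float and the launch below
// accumulates in float; otherwise U == T and the casted_* tensors simply
// alias the original buffers, since Tensor assignment shares the underlying
// storage.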
1186 Tensor casted_out_backprop = out_backprop;
1187 Tensor casted_input = input;
1188 Tensor casted_filter_backprop = *filter_backprop;
1189 const Device& device = context->template eigen_device<Device>();
1190 if (cast_to_float) {
1191 functor::CastFunctor<Device, float, Eigen::half> cast;
1192 OP_REQUIRES_OK(context,
1193 context->allocate_temp(DT_FLOAT, out_backprop.shape(),
1194 &casted_out_backprop));
1195 cast(device, casted_out_backprop.template flat<float>(),
1196 out_backprop.template flat<Eigen::half>());
1197 OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, input.shape(),
1198 &casted_input));
1199 cast(device, casted_input.template flat<float>(),
1200 input.template flat<Eigen::half>());
1201 OP_REQUIRES_OK(context,
1202 context->allocate_temp(DT_FLOAT, filter_backprop->shape(),
1203 &casted_filter_backprop));
1204 }
1205
1206 auto out_backprop_ptr = casted_out_backprop.template flat<U>().data();
1207 auto input_ptr = casted_input.template flat<U>().data();
1208 auto filter_backprop_ptr = casted_filter_backprop.template flat<U>().data();
1209 LaunchDepthwiseConvBackpropFilterOp<Device, U>()(
1210 context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr,
1211 data_format_);
1212
1213 if (cast_to_float) {
1214 functor::CastFunctor<Device, Eigen::half, float> cast;
1215 const Tensor& casted_filter_backprop_const = casted_filter_backprop;
1216 cast(device, filter_backprop->template flat<Eigen::half>(),
1217 casted_filter_backprop_const.template flat<float>());
1218 }
1219 }
1220
1221 protected:
1222 bool use_cudnn_grouped_conv_;
1223
1224 private:
1225 std::vector<int32> strides_;
1226 Padding padding_;
1227 std::vector<int64> explicit_paddings_;
1228 TensorFormat data_format_;
1229 int64 stride_;
1230
1231 // For in_depth == 1 and grouped convolutions.
1232 LaunchConv2DBackpropFilterOp<Device, T> launcher_;
1233 bool cudnn_use_autotune_;
1234 DataType dtype_;
1235
1236 TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
1237 };
1238
1239 #define REGISTER_CPU_KERNEL(T) \
1240 REGISTER_KERNEL_BUILDER( \
1241 Name("DepthwiseConv2dNativeBackpropFilter") \
1242 .Device(DEVICE_CPU) \
1243 .TypeConstraint<T>("T"), \
1244 DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
1245 TF_CALL_half(REGISTER_CPU_KERNEL);
1246 TF_CALL_float(REGISTER_CPU_KERNEL);
1247 #if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
1248 TF_CALL_double(REGISTER_CPU_KERNEL);
1249 #endif
1250 #undef REGISTER_CPU_KERNEL
1251
1252 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1253 #define REGISTER_GPU_KERNEL(T) \
1254 REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
1255 .Device(DEVICE_GPU) \
1256 .TypeConstraint<T>("T") \
1257 .HostMemory("filter_sizes"), \
1258 DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>)
1259
1260 TF_CALL_half(REGISTER_GPU_KERNEL);
1261 TF_CALL_float(REGISTER_GPU_KERNEL);
1262 TF_CALL_double(REGISTER_GPU_KERNEL);
1263 #undef REGISTER_GPU_KERNEL
1264
1265 #if CUDNN_VERSION >= 7000
1266 template <typename T>
1267 class DepthwiseConv2dGroupedConvBackpropFilterOp
1268 : public DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T> {
1269 public:
1270 DepthwiseConv2dGroupedConvBackpropFilterOp(OpKernelConstruction* context)
1271 : DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>(context) {
1272 this->use_cudnn_grouped_conv_ = true;
1273 }
1274 };
1275
1276 #define REGISTER_GROUPED_CONV_KERNEL(T) \
1277 REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
1278 .Device(DEVICE_GPU) \
1279 .TypeConstraint<T>("T") \
1280 .HostMemory("filter_sizes") \
1281 .Label("cudnn_grouped_convolution"), \
1282 DepthwiseConv2dGroupedConvBackpropFilterOp<T>)
1283
1284 TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
1285 TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
1286 TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
1287 #undef REGISTER_GROUPED_CONV_KERNEL
1288 #endif // CUDNN_VERSION
1289 #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1290
1291 } // namespace tensorflow
1292