1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #define EIGEN_USE_THREADS
17 
18 #include <algorithm>
19 #include <cmath>
20 
21 #include "tensorflow/core/framework/bounds_check.h"
22 #include "tensorflow/core/framework/kernel_shape_util.h"
23 #include "tensorflow/core/framework/numeric_op.h"
24 #include "tensorflow/core/framework/op_kernel.h"
25 #include "tensorflow/core/framework/register_types.h"
26 #include "tensorflow/core/framework/tensor.h"
27 #include "tensorflow/core/framework/tensor_shape.h"
28 #include "tensorflow/core/framework/tensor_types.h"
29 #include "tensorflow/core/framework/types.h"
30 #include "tensorflow/core/kernels/cast_op.h"
31 #include "tensorflow/core/kernels/conv_grad_ops.h"
32 #include "tensorflow/core/kernels/depthwise_conv_op.h"
33 #include "tensorflow/core/lib/core/status.h"
34 #include "tensorflow/core/platform/logging.h"
35 #include "tensorflow/core/platform/types.h"
36 #include "tensorflow/core/util/padding.h"
37 #include "tensorflow/core/util/tensor_format.h"
38 #include "tensorflow/core/util/use_cudnn.h"
39 #include "tensorflow/core/util/work_sharder.h"
40 
41 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
42 
43 #if GOOGLE_CUDA
44 #include "third_party/gpus/cudnn/cudnn.h"
45 #endif
46 
47 #include "tensorflow/core/platform/stream_executor.h"
48 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
49 
50 namespace tensorflow {
51 
52 // Gradient operations for depthwise convolution.
53 
54 typedef Eigen::ThreadPoolDevice CPUDevice;
55 typedef Eigen::GpuDevice GPUDevice;
56 
57 // Common code between the two backward pass kernels: verifies that the
58 // dimensions all match and extracts the padded rows and columns.
59 #define EXTRACT_AND_VERIFY_DIMENSIONS(label)                                   \
60   const Tensor& out_backprop = context->input(2);                              \
61   OP_REQUIRES(                                                                 \
62       context, input_shape.dims() == 4,                                        \
63       errors::InvalidArgument(label, ": input must be 4-dimensional"));        \
64   OP_REQUIRES(                                                                 \
65       context, filter_shape.dims() == 4,                                       \
66       errors::InvalidArgument(label, ": filter must be 4-dimensional"));       \
67   OP_REQUIRES(                                                                 \
68       context, out_backprop.dims() == 4,                                       \
69       errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
70   const int64 batch = input_shape.dim_size(0);                                 \
71   OP_REQUIRES(                                                                 \
72       context, batch == out_backprop.dim_size(0),                              \
73       errors::InvalidArgument(                                                 \
74           label, ": input and out_backprop must have the same batch size"));   \
75   const int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H');   \
76   OP_REQUIRES(                                                                 \
77       context,                                                                 \
78       FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()),      \
79       errors::InvalidArgument("Input rows too large"));                        \
80   const int32 input_rows = static_cast<int32>(input_rows_raw);                 \
81   const int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W');   \
82   OP_REQUIRES(                                                                 \
83       context,                                                                 \
84       FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()),      \
85       errors::InvalidArgument("Input cols too large"));                        \
86   const int32 input_cols = static_cast<int32>(input_cols_raw);                 \
87   const int64 filter_rows = filter_shape.dim_size(0);                          \
88   const int64 filter_cols = filter_shape.dim_size(1);                          \
89   const int64 output_rows_raw =                                                \
90       GetTensorDim(out_backprop.shape(), data_format_, 'H');                   \
91   OP_REQUIRES(                                                                 \
92       context,                                                                 \
93       FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()),     \
94       errors::InvalidArgument("Output rows too large"));                       \
95   const int32 output_rows = static_cast<int32>(output_rows_raw);               \
96   const int64 output_cols_raw =                                                \
97       GetTensorDim(out_backprop.shape(), data_format_, 'W');                   \
98   OP_REQUIRES(                                                                 \
99       context,                                                                 \
100       FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()),     \
101       errors::InvalidArgument("Output cols too large"));                       \
102   const int32 output_cols = static_cast<int32>(output_cols_raw);               \
103   const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C');         \
104   OP_REQUIRES(context, in_depth == filter_shape.dim_size(2),                   \
105               errors::InvalidArgument(                                         \
106                   label, ": input and filter must have the same in_depth"));   \
107   const int64 depth_multiplier = filter_shape.dim_size(3);                     \
108   const int64 out_depth_raw =                                                  \
109       GetTensorDim(out_backprop.shape(), data_format_, 'C');                   \
110   OP_REQUIRES(                                                                 \
111       context,                                                                 \
112       FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()),       \
113       errors::InvalidArgument("Output depth too large"));                      \
114   const int32 out_depth = static_cast<int32>(out_depth_raw);                   \
115   OP_REQUIRES(                                                                 \
116       context, (depth_multiplier * in_depth) == out_depth,                     \
117       errors::InvalidArgument(                                                 \
118           label, ": depth_multiplier * in_depth not equal to out_depth"));     \
119   const auto stride = stride_;                                                 \
120   int64 out_rows = 0, out_cols = 0, pad_top = 0, pad_bottom = 0, pad_left = 0, \
121         pad_right = 0;                                                         \
122   if (padding_ == Padding::EXPLICIT) {                                         \
123     GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'H', &pad_top,  \
124                              &pad_bottom);                                     \
125     GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'W', &pad_left, \
126                              &pad_right);                                      \
127   }                                                                            \
128   OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(                        \
129                               input_rows, filter_rows, stride_, padding_,      \
130                               &out_rows, &pad_top, &pad_bottom));              \
131   OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(                        \
132                               input_cols, filter_cols, stride_, padding_,      \
133                               &out_cols, &pad_left, &pad_right));              \
134   OP_REQUIRES(                                                                 \
135       context, output_rows == out_rows,                                        \
136       errors::InvalidArgument(                                                 \
137           label, ": Number of rows of out_backprop doesn't match computed: ",  \
138           "actual = ", output_rows, ", computed = ", out_rows));               \
139   OP_REQUIRES(                                                                 \
140       context, output_cols == out_cols,                                        \
141       errors::InvalidArgument(                                                 \
142           label, ": Number of cols of out_backprop doesn't match computed: ",  \
143           "actual = ", output_cols, ", computed = ", out_cols));               \
144   DepthwiseArgs args;                                                          \
145   args.batch = batch;                                                          \
146   args.in_rows = input_rows;                                                   \
147   args.in_cols = input_cols;                                                   \
148   args.in_depth = in_depth;                                                    \
149   args.filter_rows = filter_rows;                                              \
150   args.filter_cols = filter_cols;                                              \
151   args.depth_multiplier = depth_multiplier;                                    \
152   args.stride = stride;                                                        \
153   args.pad_rows = pad_top;                                                     \
154   args.pad_cols = pad_left;                                                    \
155   args.out_rows = out_rows;                                                    \
156   args.out_cols = out_cols;                                                    \
157   args.out_depth = out_depth;                                                  \
158   VLOG(2) << "DepthwiseConv2d: " << label << " Input: [" << batch << ", "      \
159           << input_rows << ", " << input_cols << ", " << in_depth              \
160           << "]; Filter: [" << filter_rows << ", " << filter_cols << ", "      \
161           << in_depth << ", " << depth_multiplier << "]; stride = " << stride  \
162           << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left         \
163           << ", output: [" << batch << ", " << out_rows << ", " << out_cols    \
164           << ", " << out_depth << "]";
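
// Worked example of the macro above (illustrative shapes only): for an NHWC
// input of shape [1, 5, 5, 3], a filter of shape [2, 2, 3, 2] (so
// depth_multiplier = 2), stride 1 and VALID padding, it computes
// out_depth = 3 * 2 = 6 and out_rows = out_cols = (5 - 2) / 1 + 1 = 4, so
// 'out_backprop' is required to have shape [1, 4, 4, 6] and
// pad_top = pad_left = 0.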
165 
166 // Copies data from local region in 'out_backprop' into 'buffer'.
167 // The local region coordinates are calculated as the set of output points which
168 // used the input point ('in_r', 'in_c') as input during the forward pass.
169 // Rather than spatially reversing the filter, the input is reversed during
170 // the copy. The copied data is padded to vector register-width boundaries so
171 // that it is aligned for efficient traversal and vector multiply-add by the
172 // depthwise input kernel.
173 //
174 // EX:
175 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
176 //
177 //   'out_backprop': [batch, out_rows, out_cols, out_depth]
178 //
179 //     [a00, a01, a10, a11] [a20, a21, b00, b01]
180 //     [b10, b11, b20, b21] [...]
181 //     [e00, e01, e10, e11] [e20, e21, f00, f01]
182 //     [f10, f11, f20, f21] [...]
183 //
184 //   'buffer' (register boundaries shown):
185 //
186 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
187 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
188 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
189 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
190 //
191 template <typename T>
192 static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
193                                      const int64 padded_filter_inner_dim_size,
194                                      const int64 in_r, const int64 in_c,
195                                      const T* out_backprop, T* buffer) {
196   typedef typename Eigen::internal::packet_traits<T>::type Packet;
197   static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
198 
199   const int64 stride = args.stride;
200   const int64 filter_rows = args.filter_rows;
201   const int64 filter_cols = args.filter_cols;
202   const int64 pad_rows = args.pad_rows;
203   const int64 pad_cols = args.pad_cols;
204   const int64 out_rows = args.out_rows;
205   const int64 out_cols = args.out_cols;
206 
207   // Calculate the output spatial region which used point (in_r, in_c) as input.
208   const int64 out_r_start = std::max(
209       static_cast<int64>(0), (in_r - filter_rows + pad_rows + stride) / stride);
210   const int64 out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
211   const int64 out_c_start = std::max(
212       static_cast<int64>(0), (in_c - filter_cols + pad_cols + stride) / stride);
213   const int64 out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);
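  // Illustrative example of the formulas above: with stride = 1, a 2x2 filter
  // and no padding, input point (in_r, in_c) = (1, 1) was read by output rows
  // 0..1 and cols 0..1, i.e. out_r_start = max(0, 1 - 2 + 0 + 1) = 0 and
  // out_r_end = min(out_rows - 1, 1) = 1 (and likewise for the columns).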
214 
215   // Zero-pad 'buffer' if output region is smaller than filter spatial size.
216   const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
217   if ((out_r_end - out_r_start + 1) < args.filter_rows ||
218       (out_c_end - out_c_start + 1) < args.filter_cols) {
219     memset(buffer, 0,
220            filter_spatial_size * padded_filter_inner_dim_size * sizeof(T));
221   }
222 
223   // Calculate vectorized and scalar (residual) lengths for 'in_depth'.
224   const int64 vectorized_size = (args.out_depth / kPacketSize) * kPacketSize;
225   const int64 scalar_size = args.out_depth % kPacketSize;
226   const int64 pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
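  // E.g. with out_depth = 6 and kPacketSize = 4 (as in the example above):
  // vectorized_size = 4, scalar_size = 2 and pad_size = 2, so each
  // (f_r, f_c) row of 'buffer' holds 4 + 2 copied values plus 2 zeros of
  // padding, i.e. padded_filter_inner_dim_size = 8 elements in total.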
227 
228   for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
229     const int64 f_r = in_r + pad_rows - out_r * stride;
230     for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
231       const int64 f_c = in_c + pad_cols - out_c * stride;
232       const int64 buf_base =
233           (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
234       // Calculate index into 'out_backprop' for coordinate (out_r, out_c).
235       auto* out_bprop =
236           out_backprop + (out_r * args.out_cols + out_c) * args.out_depth;
237 
238       // Copy vectorized portion of inner dimension into 'buffer'.
239       for (int64 d = 0; d < vectorized_size; d += kPacketSize) {
240         auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
241         Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
242       }
243       // Copy scalar portion of out_bprop to 'buffer'
244       for (int64 d = 0; d < scalar_size; ++d) {
245         buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
246       }
247       // Pad to vector-register width (if needed).
248       for (int64 d = 0; d < pad_size; ++d) {
249         buffer[buf_base + vectorized_size + scalar_size + d] =
250             static_cast<T>(0);
251       }
252     }
253   }
254 }
255 
256 // Computes the vectorized product of 'buffer' and 'filter' and stores
257 // result in 'output' at location computed from 'in_r' and 'in_c'.
258 // If depth_multiplier is > 1, the intermediate output is reduced along
259 // the depth_multiplier dimension.
260 //
261 // EX:
262 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
263 //   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
264 //
265 //   'buffer' [rows, cols, in_depth, depth_multiplier]
266 //
267 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
268 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
269 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
270 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
271 //
272 //   filter [rows, cols, in_depth, depth_multiplier]
273 //     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
274 //     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
275 //
276 //   First output register [in_depth, depth_multiplier]
277 //     [q00, q01, q10, q11] = ([f00, f01, f10, f11] x [u0, v0, w0, x0]) +
278 //                            ([e00, e01, e10, e11] x [u1, v1, w1, x1]) +
279 //                            ([b00, b01, b10, b11] x [u2, v2, w2, x2]) +
280 //                            ([a00, a01, a10, a11] x [u3, v3, w3, x3])
281 //
282 //   Reduction step along depth-multiplier dimension:
283 //
284 //     [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0]
285 //
286 
287 template <typename T>
288 static void ComputeBackpropInput(const DepthwiseArgs& args,
289                                  const int64 padded_filter_inner_dim_size,
290                                  const int64 in_r, const int64 in_c,
291                                  const T* filter, const T* buffer,
292                                  T* out_buffer, T* output) {
293   typedef typename Eigen::internal::packet_traits<T>::type Packet;
294   static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
295 
296   const int64 in_depth = args.in_depth;
297   const int64 depth_multiplier = args.depth_multiplier;
298   const int64 out_depth = args.out_depth;
299   const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
300 
301   // Calculate vectorized and scalar lengths of 'out_depth'.
302   const int64 output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
303   const int64 output_scalar_size = out_depth % kPacketSize;
304 
305   // Calculate base index at which to begin writing output.
306   const int64 base_output_index = (in_r * args.in_cols + in_c) * in_depth;
307 
308   // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is
309   // used to efficiently reduce output when 'depth_multiplier' > kPacketSize.
310   const int64 dm_vectorized_size =
311       (depth_multiplier / kPacketSize) * kPacketSize;
312   const int64 dm_scalar_size = depth_multiplier % kPacketSize;
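  // E.g. with depth_multiplier = 2 and kPacketSize = 4: dm_vectorized_size = 0
  // and dm_scalar_size = 2, so the reduction at the end of this function uses
  // only its scalar loop; with depth_multiplier = 6, one packet (4 values) is
  // reduced with predux and the remaining 2 values are added as scalars.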
313 
314   for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
315     // Reset accumulator.
316     auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
317     for (int j = 0; j < filter_spatial_size; ++j) {
318       // Calculate index.
319       const int64 index = i + j * padded_filter_inner_dim_size;
320       // Load filter.
321       const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
322       // Load input.
323       const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
324       // Vector multiply-add.
325       vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
326     }
327     if (depth_multiplier == 1) {
328       // Write directly to the output.
329       Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
330     } else {
331       // Buffer output for subsequent reduction step.
332       Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
333     }
334   }
335 
336   if (output_scalar_size > 0) {
337     auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
338     for (int j = 0; j < filter_spatial_size; ++j) {
339       const int64 index =
340           output_vectorized_size + j * padded_filter_inner_dim_size;
341       const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
342       const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
343       vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
344     }
345     // Load accumulator into an array and loop through output.
346     T out_buf[kPacketSize];
347     Eigen::internal::pstoreu<T>(out_buf, vaccum);
348     if (depth_multiplier == 1) {
349       // Write directly to the output.
350       for (int j = 0; j < output_scalar_size; ++j) {
351         output[base_output_index + output_vectorized_size + j] = out_buf[j];
352       }
353     } else {
354       // Buffer output for subsequent reduction step.
355       for (int j = 0; j < output_scalar_size; ++j) {
356         out_buffer[output_vectorized_size + j] = out_buf[j];
357       }
358     }
359   }
360 
361   // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'.
362   if (depth_multiplier > 1) {
363     for (int64 d = 0; d < in_depth; ++d) {
364       const int64 index = d * args.depth_multiplier;
365       T accum = static_cast<T>(0);
366       for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
367         const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
368         accum += Eigen::internal::predux(v);
369       }
370       // Copy scalar portion of replicated output.
371       for (int64 dm = 0; dm < dm_scalar_size; ++dm) {
372         accum += out_buffer[index + dm_vectorized_size + dm];
373       }
374       // Copy to output.
375       output[base_output_index + d] = accum;
376     }
377   }
378 }
379 
380 // Computes the depthwise conv2d backprop input of 'out_backprop' by
381 // 'depthwise_filter' and stores the result in 'in_backprop'.
382 template <typename T>
383 struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
384   typedef typename Eigen::internal::packet_traits<T>::type Packet;
385 
386   void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
387                   const T* out_backprop, const T* depthwise_filter,
388                   T* in_backprop, TensorFormat data_format) {
389     OP_REQUIRES(
390         ctx, data_format == FORMAT_NHWC,
391         errors::Unimplemented(
392             "Depthwise convolution on CPU is only supported for NHWC format"));
393 
394     static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
395 
396     // Pad 'depthwise_filter' to vector register width (if needed).
397     const bool pad_filter = (args.out_depth % kPacketSize) != 0;
398     Tensor padded_filter;
399     if (pad_filter) {
400       // Allocate space for padded filter.
401       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
402       const int64 padded_filter_inner_dim_size =
403           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
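      // E.g. out_depth = 6 with kPacketSize = 4 is padded up to
      // padded_filter_inner_dim_size = ((6 + 3) / 4) * 4 = 8.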
404       OP_REQUIRES_OK(
405           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
406                                   TensorShape({filter_spatial_size,
407                                                padded_filter_inner_dim_size}),
408                                   &padded_filter));
409       // Write out padded filter.
410       functor::DepthwiseFilterPadOp<T>()(
411           args, depthwise_filter, padded_filter.template flat<T>().data());
412     }
413     const T* filter_data =
414         pad_filter ? padded_filter.template flat<T>().data() : depthwise_filter;
415 
416     // Computes one shard of depthwise conv2d backprop input.
417     auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
418                      int64 start, int64 limit) {
419       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
420 
421       const int64 input_image_size =
422           args.in_rows * args.in_cols * args.in_depth;
423       const int64 output_image_size =
424           args.out_rows * args.out_cols * args.out_depth;
425       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
426       const int64 padded_filter_inner_dim_size =
427           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
428 
429       // Allocate buffer to copy regions from 'out_backprop'.
430       Tensor out_bprop_buffer;
431       OP_REQUIRES_OK(
432           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
433                                   TensorShape({filter_spatial_size,
434                                                padded_filter_inner_dim_size}),
435                                   &out_bprop_buffer));
436       T* out_bprop_buf = out_bprop_buffer.template flat<T>().data();
437 
438       // Allocate buffer for intermediate results.
439       Tensor in_bprop_buffer;
440       OP_REQUIRES_OK(
441           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
442                                   TensorShape({padded_filter_inner_dim_size}),
443                                   &in_bprop_buffer));
444       T* in_bprop_buf = in_bprop_buffer.template flat<T>().data();
445 
446       for (int64 b = start; b < limit; ++b) {
447         for (int64 in_r = 0; in_r < args.in_rows; ++in_r) {
448           for (int64 in_c = 0; in_c < args.in_cols; ++in_c) {
449             // Populate 'out_bprop_buf' from local 'out_backprop' region.
450             CopyOutputBackpropRegion<T>(
451                 args, padded_filter_inner_dim_size, in_r, in_c,
452                 out_backprop + b * output_image_size, out_bprop_buf);
453 
454             // Compute depthwise backprop input.
455             ComputeBackpropInput<T>(args, padded_filter_inner_dim_size, in_r,
456                                     in_c, filter_data, out_bprop_buf,
457                                     in_bprop_buf,
458                                     in_backprop + b * input_image_size);
459           }
460         }
461       }
462     };
463 
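    // 'Shard' below splits the batch range [0, args.batch) across the CPU
    // worker threads; 'shard_cost' is a rough per-batch-element cost estimate
    // (roughly the number of output values each shard touches) used to decide
    // how finely to split the work.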
464     const int64 shard_cost = args.in_rows * args.in_cols * args.out_depth;
465     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
466     Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
467           shard_cost, shard);
468   }
469 };
470 
471 template <typename T>
472 static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
473                                                 const T* out_backprop,
474                                                 const T* filter,
475                                                 T* in_backprop) {
476   // Naive for loop as a reference point without concerns about performance.
477   for (int b = 0; b < args.batch; ++b) {
478     for (int in_r = 0; in_r < args.in_rows; ++in_r) {
479       for (int in_c = 0; in_c < args.in_cols; ++in_c) {
480         for (int in_d = 0; in_d < args.in_depth; ++in_d) {
481           T sum = 0;
482           const int stride = args.stride;
483           const int out_d_start = in_d * args.depth_multiplier;
484           const int out_d_end = out_d_start + args.depth_multiplier;
485 
486           for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
487             const int out_r_start = std::max(
488                 0, (in_r - args.filter_rows + args.pad_rows + stride) / stride);
489             const int out_r_end =
490                 std::min(args.out_rows - 1, (in_r + args.pad_rows) / stride);
491 
492             for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
493               const int out_c_start = std::max(
494                   0,
495                   (in_c - args.filter_cols + args.pad_cols + stride) / stride);
496               const int out_c_end =
497                   std::min(args.out_cols - 1, (in_c + args.pad_cols) / stride);
498 
499               for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
500                 int f_r = in_r + args.pad_rows - out_r * stride;
501                 int f_c = in_c + args.pad_cols - out_c * stride;
502                 int filter_dm = out_d - out_d_start;
503                 int out_backprop_offset =
504                     out_d +
505                     args.out_depth *
506                         (out_c + args.out_cols * (out_r + args.out_rows * b));
507                 int filter_offset =
508                     filter_dm +
509                     args.depth_multiplier *
510                         (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
511                 sum +=
512                     out_backprop[out_backprop_offset] * filter[filter_offset];
513               }
514             }
515           }
516 
517           int in_backprop_offset =
518               in_d +
519               args.in_depth * (in_c + args.in_cols * (in_r + args.in_rows * b));
520           in_backprop[in_backprop_offset] = sum;
521         }
522       }
523     }
524   }
525 }
526 
527 // Extern template instantiated in conv_grad_input_ops.cc.
528 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>;
529 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, float>;
530 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, double>;
531 
532 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
533 
534 // Extern template instantiated in conv_grad_input_ops.cc.
535 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
536 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
537 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;
538 
539 // Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
540 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
541                                                           Eigen::half>;
542 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
543 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
544 
545 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
546 
547 // Kernel to compute the input backprop for depthwise convolution.
548 template <typename Device, class T>
549 class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
550  public:
551   explicit DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction* context)
552       : OpKernel(context) {
553     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
554     OP_REQUIRES(context, strides_.size() == 4,
555                 errors::InvalidArgument("Sliding window strides field must "
556                                         "specify 4 dimensions"));
557 
558     string data_format;
559     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
560     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
561                 errors::InvalidArgument("Invalid data format"));
562 
563     stride_ = GetTensorDim(strides_, data_format_, 'H');
564     const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
565     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
566     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
567 
568     OP_REQUIRES(context, stride_ == stride_w,
569                 errors::InvalidArgument(
570                     "Current implementation only supports equal length "
571                     "strides in the row and column dimensions."));
572     OP_REQUIRES(
573         context, (stride_n == 1 && stride_c == 1),
574         errors::InvalidArgument("Current implementation does not yet support "
575                                 "strides in the batch and depth dimensions."));
576     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
577     OP_REQUIRES_OK(context,
578                    context->GetAttr("explicit_paddings", &explicit_paddings_));
579     OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
580                                               /*num_dims=*/4, data_format_));
581 
582     cudnn_use_autotune_ = CudnnUseAutotune();
583     dtype_ = DataTypeToEnum<T>::value;
584 #if CUDNN_VERSION >= 8000
585     // From the cuDNN release note 8.0: We’ve extended the fprop and dgrad
586     // NHWC depthwise kernels to support more combinations (filter
587     // sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
588     // to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
589     // good performance. (https://docs.nvidia.com/deeplearning/sdk/cudnn-
590     // release-notes/rel_8.html#rel_8)
591     use_cudnn_grouped_conv_ =
592         dtype_ == DT_HALF &&
593         ((data_format_ == FORMAT_NCHW && stride_ == 1 && stride_w == 1) ||
594          (data_format_ == FORMAT_NHWC && stride_ == stride_w &&
595           (stride_ == 1 || stride_ == 2)));
596 #elif CUDNN_VERSION >= 7603
597     // Use CuDNN grouped conv (input gradient) when stride = 1, input/output is
598     // NCHW and float16(half). See cudnn release note 7.6.3 (https://docs.nvidi
599     // a.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763).
600     use_cudnn_grouped_conv_ = dtype_ == DT_HALF &&
601                               data_format_ == FORMAT_NCHW && stride_ == 1 &&
602                               stride_w == 1;
603 #else
604     use_cudnn_grouped_conv_ = false;
605 #endif
606   }
607 
608   void Compute(OpKernelContext* context) override {
609     const Tensor& input_sizes = context->input(0);
610     const Tensor& filter = context->input(1);
611     OP_REQUIRES(
612         context, TensorShapeUtils::IsVector(input_sizes.shape()),
613         errors::InvalidArgument(
614             "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
615             input_sizes.dims()));
616     TensorShape input_shape;
617     const int32* in_sizes_data = input_sizes.template flat<int32>().data();
618 
619     for (int i = 0; i < input_sizes.NumElements(); ++i) {
620       OP_REQUIRES(context, in_sizes_data[i] >= 0,
621                   errors::InvalidArgument("Dimension ", i,
622                                           " of input_sizes must be >= 0"));
623       input_shape.AddDim(in_sizes_data[i]);
624     }
625     const TensorShape& filter_shape = filter.shape();
626     EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
627 
628     Tensor* in_backprop = nullptr;
629     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
630                                 {0}, 0, input_shape, &in_backprop));
631 
632     // If there is nothing to compute, return.
633     if (input_shape.num_elements() == 0) {
634       return;
635     }
636 
637     // If in_depth==1, this operation is just a standard convolution.
638     // Depthwise convolution is a special case of cuDNN's grouped convolution.
639     bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
640                      (in_depth == 1 ||
641                       (use_cudnn_grouped_conv_ &&
642                        IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
643                                                   /*filter_cols=*/filter_cols,
644                                                   /*in_depth=*/in_depth,
645                                                   /*out_depth=*/out_depth)));
646 
647     VLOG(2) << "DepthwiseConv2dNativeBackpropInput: "
648             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
649             << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
650             << filter_cols << ", " << in_depth << ", " << depth_multiplier
651             << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
652             << ", " << out_depth << "], stride = " << stride_
653             << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left
654             << ", Use cuDNN: " << use_cudnn;
655 
656     if (use_cudnn) {
657       // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
658       //
659       //                  | TensorFlow       | cuDNN
660       // --------------------------------------------------------------------
661       // filter_out_depth | depth_multiplier | depth_multiplier * group_count
662       // filter_in_depth  | in_depth         | in_depth / group_count
663       //
664       // For depthwise convolution, we have group_count == in_depth.
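      // E.g. (illustrative shapes) a TensorFlow depthwise filter of shape
      // [3, 3, 8, 2] (in_depth = 8, depth_multiplier = 2) is viewed below as a
      // cuDNN grouped-convolution filter of shape [3, 3, 1, 16] with
      // group_count = 8.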
665       int32 filter_in_depth = 1;
666       TensorShape shape =
667           TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
668       Tensor reshaped_filter(/*type=*/dtype_);
669       OP_REQUIRES(
670           context, reshaped_filter.CopyFrom(filter, shape),
671           errors::Internal(
672               "Failed to reshape filter tensor for grouped convolution."));
673       // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
674       // conv is supported.
675       launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
676                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
677                 stride_, stride_, padding_, explicit_paddings_, in_backprop,
678                 data_format_);
679       return;
680     }
681 
682     auto out_backprop_ptr = out_backprop.template flat<T>().data();
683     auto filter_ptr = filter.template flat<T>().data();
684     auto in_backprop_ptr = in_backprop->template flat<T>().data();
685     LaunchDepthwiseConvBackpropInputOp<Device, T>()(
686         context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
687         data_format_);
688   }
689 
690  protected:
691   bool use_cudnn_grouped_conv_;
692 
693  private:
694   std::vector<int32> strides_;
695   Padding padding_;
696   std::vector<int64> explicit_paddings_;
697   TensorFormat data_format_;
698   int64 stride_;
699 
700   // For in_depth == 1 and grouped convolutions.
701   LaunchConv2DBackpropInputOp<Device, T> launcher_;
702   bool cudnn_use_autotune_;
703   DataType dtype_;
704 
705   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
706 };
707 
708 #define REGISTER_CPU_KERNEL(T)                                       \
709   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
710                               .Device(DEVICE_CPU)                    \
711                               .TypeConstraint<T>("T"),               \
712                           DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);
713 
714 TF_CALL_half(REGISTER_CPU_KERNEL);
715 TF_CALL_float(REGISTER_CPU_KERNEL);
716 #if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
717 TF_CALL_double(REGISTER_CPU_KERNEL);
718 #endif
719 #undef REGISTER_CPU_KERNEL
720 
721 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
722 
723 #define REGISTER_GPU_KERNEL(T)                                       \
724   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
725                               .Device(DEVICE_GPU)                    \
726                               .TypeConstraint<T>("T")                \
727                               .HostMemory("input_sizes"),            \
728                           DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>)
729 
730 TF_CALL_half(REGISTER_GPU_KERNEL);
731 TF_CALL_float(REGISTER_GPU_KERNEL);
732 TF_CALL_double(REGISTER_GPU_KERNEL);
733 #undef REGISTER_GPU_KERNEL
734 
735 #if CUDNN_VERSION >= 7000
736 template <typename T>
737 class DepthwiseConv2dGroupedConvBackpropInputOp
738     : public DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T> {
739  public:
740   DepthwiseConv2dGroupedConvBackpropInputOp(OpKernelConstruction* context)
741       : DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>(context) {
742     this->use_cudnn_grouped_conv_ = true;
743   }
744 };
745 
746 #define REGISTER_GROUPED_CONV_KERNEL(T)                              \
747   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
748                               .Device(DEVICE_GPU)                    \
749                               .TypeConstraint<T>("T")                \
750                               .HostMemory("input_sizes")             \
751                               .Label("cudnn_grouped_convolution"),   \
752                           DepthwiseConv2dGroupedConvBackpropInputOp<T>)
753 
754 TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
755 TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
756 TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
757 #undef REGISTER_GROUPED_CONV_KERNEL
758 #endif  // CUDNN_VERSION
759 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
760 
761 // Kernels to compute the gradients of the filters for depthwise convolution.
762 
763 // Computes filter backprop using 'out_backprop' and 'input_buffer', storing the
764 // result in 'output_buffer' at an index computed from 'out_r' and 'out_c'.
765 //
766 // EX:
767 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
768 //   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
769 //
770 //   'input_buffer' [rows, cols, in_depth, depth_multiplier]
771 //
772 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
773 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
774 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
775 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
776 //
777 //   'out_backprop' [out_rows, out_cols, in_depth, depth_multiplier]
778 //
779 //     [q00, q01, q10, q11] [q20, q21, r00, r01]
780 //     [r10, r11, r20, r21] [s00, s01, s10, s11]
781 //     [s20, s21, t00, t01] [t10, t11, t20, t21]
782 //
783 //   First output register of 'filter_backprop'
784 //     [u0, v0, w0, x0] += ([f00, f01, f10, f11] x [q00, q01, q10, q11])
785 //
786 template <typename T>
787 static void ComputeBackpropFilter(const DepthwiseArgs& args,
788                                   const int64 padded_out_depth_size,
789                                   const int64 out_r, const int64 out_c,
790                                   const T* out_backprop, const T* input_buffer,
791                                   T* output_buffer) {
792   typedef typename Eigen::internal::packet_traits<T>::type Packet;
793   static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
794   // Calculate vectorized size of 'padded_out_depth_size'.
795   const int64 out_depth = args.out_depth;
796   const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
797   const int64 output_vectorized_size =
798       (padded_out_depth_size / kPacketSize) * kPacketSize;
799   const int64 base_output_index = (out_r * args.out_cols + out_c) * out_depth;
800   // Determine whether we can execute the fast or the slow code path.
801   const int64 output_image_size =
802       args.out_rows * args.out_cols * args.out_depth;
803   const int64 output_last_vector_index =
804       output_image_size - (filter_spatial_size * padded_out_depth_size);
805   const bool fast_path = base_output_index <= output_last_vector_index;
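  // The fast-path check is conservative: it leaves at least
  // filter_spatial_size * padded_out_depth_size elements of headroom, so every
  // unaligned packet load from 'out_backprop' in the loop below stays in
  // bounds. E.g. with a 4 x 4 x 6 'out_backprop' (96 elements), a 2x2 filter
  // and padded_out_depth_size = 8, the fast path is taken while
  // base_output_index <= 96 - 4 * 8 = 64.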
806 
807   if (fast_path) {
808     // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can
809     // amortize the cost of 'output_buffer' load store in the loop below.
810     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
811       // Load vector register from 'out_backprop'.
812       const auto out_bprop_block =
813           Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
814       for (int j = 0; j < filter_spatial_size; ++j) {
815         const int64 index = i + j * padded_out_depth_size;
816         // Load vector register from 'input_buffer'.
817         const auto input_block =
818             Eigen::internal::ploadu<Packet>(input_buffer + index);
819         // Load output block into vector register.
820         auto out_block_data = output_buffer + index;
821         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
822         // Vector multiply-add.
823         out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
824                                                    out_block);
825         // Store 'out_block' back to memory.
826         Eigen::internal::pstoreu<T>(out_block_data, out_block);
827       }
828     }
829   } else {
830     // Slow path (can't do vector reads from non-padded 'out_backprop').
831     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
832       // Calculate safe read size from 'out_backprop'.
833       const int64 out_bprop_index = base_output_index + i;
834       const int64 out_bprop_limit =
835           std::min(output_image_size, out_bprop_index + kPacketSize);
836       T out_buf[kPacketSize];
837       memset(&out_buf, 0, kPacketSize * sizeof(T));
838       const int64 scalar_size = out_bprop_limit - out_bprop_index;
839       for (int64 j = 0; j < scalar_size; ++j) {
840         out_buf[j] = out_backprop[out_bprop_index + j];
841       }
842       // Load vector register from 'out_buf'.
843       const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
844       for (int j = 0; j < filter_spatial_size; ++j) {
845         const int64 index = i + j * padded_out_depth_size;
846         // Load vector register from 'input_buffer'.
847         const auto input_block =
848             Eigen::internal::ploadu<Packet>(input_buffer + index);
849         // Load output block into vector register.
850         auto out_block_data = output_buffer + index;
851         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
852         // Vector multiply-add.
853         out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
854                                                    out_block);
855         // Store 'out_block' back to memory.
856         Eigen::internal::pstoreu<T>(out_block_data, out_block);
857       }
858     }
859   }
860 }
861 
862 template <typename Device, typename T>
863 struct LaunchDepthwiseConvBackpropFilterOp;
864 
865 template <typename T>
866 struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
867   typedef typename Eigen::internal::packet_traits<T>::type Packet;
868 
869   void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
870                   const T* out_backprop, const T* input, T* filter_backprop,
871                   TensorFormat data_format) {
872     OP_REQUIRES(
873         ctx, data_format == FORMAT_NHWC,
874         errors::Unimplemented(
875             "Depthwise convolution on CPU is only supported for NHWC format"));
876 
877     static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
878 
879     const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
880     const int64 padded_out_depth_size =
881         ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
882 
883     // Allocate output buffers for each image in 'batch' (padded to vector
884     // register boundaries).
885     Tensor output_buffer;
886     OP_REQUIRES_OK(
887         ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
888                                 TensorShape({args.batch, filter_spatial_size,
889                                              padded_out_depth_size}),
890                                 &output_buffer));
891     T* output_buffer_data = output_buffer.template flat<T>().data();
892 
893     // Computes one shard of depthwise conv2d backprop filter.
894     auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
895                      int64 start, int64 limit) {
896       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
897       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
898       const int64 padded_out_depth_size =
899           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
900 
901       // Allocate buffer for local input regions.
902       Tensor input_buffer;
903       OP_REQUIRES_OK(
904           ctx, ctx->allocate_temp(
905                    DataTypeToEnum<T>::value,
906                    TensorShape({filter_spatial_size, padded_out_depth_size}),
907                    &input_buffer));
908       T* input_buffer_data = input_buffer.template flat<T>().data();
909 
910       const int64 input_image_size =
911           args.in_rows * args.in_cols * args.in_depth;
912       const int64 output_image_size =
913           args.out_rows * args.out_cols * args.out_depth;
914       const int64 padded_filter_size =
915           filter_spatial_size * padded_out_depth_size;
916 
917       for (int b = start; b < limit; ++b) {
918         // Initialize 'output_buffer' for 'b'.
919         auto* output_buffer = output_buffer_data + b * padded_filter_size;
920         memset(output_buffer, 0, padded_filter_size * sizeof(T));
921 
922         for (int out_r = 0; out_r < args.out_rows; ++out_r) {
923           for (int out_c = 0; out_c < args.out_cols; ++out_c) {
924             // Populate 'input_buffer_data' with data from local input region.
925             functor::DepthwiseInputCopyOp<T>()(
926                 args, padded_out_depth_size, out_r, out_c,
927                 input + b * input_image_size, input_buffer_data);
928             // Compute depthwise backprop filter.
929             ComputeBackpropFilter(args, padded_out_depth_size, out_r, out_c,
930                                   out_backprop + b * output_image_size,
931                                   input_buffer_data, output_buffer);
932           }
933         }
934       }
935     };
936     const int64 shard_cost = args.out_rows * args.out_cols * args.out_depth;
937     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
938     Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
939           shard_cost, shard);
940 
941     // Accumulate 'output_buffer' from each shard into 'output'.
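    // Each batch element wrote to its own padded slice of 'output_buffer', so
    // the shards above never raced; the loops below sum those per-batch
    // partial gradients into 'filter_backprop'.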
942     const int64 out_depth = args.out_depth;
943     const int64 vectorized_size = (out_depth / kPacketSize) * kPacketSize;
944     const int64 scalar_size = out_depth - vectorized_size;
945     const int64 padded_filter_size =
946         filter_spatial_size * padded_out_depth_size;
947     memset(filter_backprop, 0, filter_spatial_size * out_depth * sizeof(T));
948 
949     for (int64 i = 0; i < filter_spatial_size; ++i) {
950       const int64 buffer_base = i * padded_out_depth_size;
951       const int64 output_base = i * out_depth;
952       // Write vectorized length of filter's inner dimension to output.
953       for (int64 j = 0; j < vectorized_size; j += kPacketSize) {
954         // Load data from 'filter_backprop' into vector register.
955         auto out_block_data = filter_backprop + output_base + j;
956         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
957         for (int b = 0; b < args.batch; ++b) {
958           // Load data from 'output_buffer' for 'b'.
959           const auto* output_buffer =
960               output_buffer_data + b * padded_filter_size;
961           const auto v =
962               Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j);
963           // Add 'v' to 'out_block'.
964           out_block = Eigen::internal::padd<Packet>(out_block, v);
965         }
966         // Store 'out_block' back to memory.
967         Eigen::internal::pstoreu<T>(out_block_data, out_block);
968       }
969       // Write scalar length of filter's inner dimension to output.
970       for (int64 j = 0; j < scalar_size; ++j) {
971         for (int b = 0; b < args.batch; ++b) {
972           const auto* output_buffer =
973               output_buffer_data + b * padded_filter_size;
974           filter_backprop[output_base + vectorized_size + j] +=
975               output_buffer[buffer_base + vectorized_size + j];
976         }
977       }
978     }
979   }
980 };
981 
982 template <typename T>
983 static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
984                                                  const T* out_backprop,
985                                                  const T* input,
986                                                  T* filter_backprop) {
987   int num_filter_backprop = args.filter_rows * args.filter_cols *
988                             args.in_depth * args.depth_multiplier;
989   memset(filter_backprop, 0, num_filter_backprop * sizeof(T));
990   // Naive for loop as a reference point without concerns about performance.
991   for (int b = 0; b < args.batch; ++b) {
992     for (int out_r = 0; out_r < args.out_rows; ++out_r) {
993       for (int out_c = 0; out_c < args.out_cols; ++out_c) {
994         for (int out_d = 0; out_d < args.out_depth; ++out_d) {
995           const int in_d = out_d / args.depth_multiplier;
996           const int dm = out_d % args.depth_multiplier;
997           const int in_r_start = out_r * args.stride - args.pad_rows;
998           const int in_c_start = out_c * args.stride - args.pad_cols;
999 
1000           for (int f_r = 0; f_r < args.filter_rows; ++f_r) {
1001             for (int f_c = 0; f_c < args.filter_cols; ++f_c) {
1002               const int in_r = in_r_start + f_r;
1003               const int in_c = in_c_start + f_c;
1004 
1005               if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
1006                   in_c < args.in_cols) {
1007                 int out_backprop_offset =
1008                     out_d +
1009                     args.out_depth *
1010                         (out_c + args.out_cols * (out_r + args.out_rows * b));
1011                 int input_offset =
1012                     in_d +
1013                     args.in_depth *
1014                         (in_c + args.in_cols * (in_r + args.in_rows * b));
1015                 int filter_backprop_offset =
1016                     dm +
1017                     args.depth_multiplier *
1018                         (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
1019                 filter_backprop[filter_backprop_offset] +=
1020                     input[input_offset] * out_backprop[out_backprop_offset];
1021               }
1022             }
1023           }
1024         }
1025       }
1026     }
1027   }
1028 }
1029 
1030 // Extern template instantiated in conv_grad_filter_ops.cc.
1031 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, Eigen::half>;
1032 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, float>;
1033 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, double>;
1034 
1035 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1036 
1037 // Extern template instantiated in conv_grad_filter_ops.cc.
1038 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, Eigen::half>;
1039 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, float>;
1040 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, double>;
1041 
1042 // Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
1043 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
1044                                                            Eigen::half>;
1045 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
1046 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
1047 
1048 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1049 
1050 // Kernel to compute the filter backprop for depthwise convolution.
1051 template <typename Device, class T>
1052 class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
1053  public:
1054   explicit DepthwiseConv2dNativeBackpropFilterOp(OpKernelConstruction* context)
1055       : OpKernel(context) {
1056     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
1057     OP_REQUIRES(context, strides_.size() == 4,
1058                 errors::InvalidArgument("Sliding window strides field must "
1059                                         "specify 4 dimensions"));
1060 
1061     string data_format;
1062     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
1063     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
1064                 errors::InvalidArgument("Invalid data format"));
1065 
1066     stride_ = GetTensorDim(strides_, data_format_, 'H');
1067     const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
1068     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
1069     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
1070 
1071     OP_REQUIRES(context, stride_ == stride_w,
1072                 errors::InvalidArgument(
1073                     "Current implementation only supports equal length "
1074                     "strides in the row and column dimensions."));
1075     OP_REQUIRES(
1076         context, (stride_n == 1 && stride_c == 1),
1077         errors::InvalidArgument("Current implementation does not yet support "
1078                                 "strides in the batch and depth dimensions."));
1079     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1080     OP_REQUIRES_OK(context,
1081                    context->GetAttr("explicit_paddings", &explicit_paddings_));
1082     OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
1083                                               /*num_dims=*/4, data_format_));
1084 
1085     cudnn_use_autotune_ = CudnnUseAutotune();
1086 
1087     if (std::is_same<T, Eigen::half>::value) {
1088       dtype_ = DT_HALF;
1089     } else if (std::is_same<T, float>::value) {
1090       dtype_ = DT_FLOAT;
1091     } else if (std::is_same<T, double>::value) {
1092       dtype_ = DT_DOUBLE;
1093     } else {
1094       LOG(ERROR) << "Only half, float, and double are supported.";
1095     }
1096     // Use CuDNN grouped conv (filter gradients) when input/output is
1097     // float16(half). See cudnn release note 7.6.3. (https://docs.nvidia.com/dee
1098     // plearning/sdk/cudnn-release-notes/rel_763.html#rel_763)
1099 #if CUDNN_VERSION >= 7603
1100     use_cudnn_grouped_conv_ = dtype_ == DT_HALF;
1101 #else
1102     use_cudnn_grouped_conv_ = false;
1103 #endif
1104   }
1105 
  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& filter_sizes = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(filter_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
            filter_sizes.dims()));
    TensorShape filter_shape;
    const int32* filter_sizes_data = filter_sizes.template flat<int32>().data();
    for (int i = 0; i < filter_sizes.NumElements(); ++i) {
      OP_REQUIRES(context, filter_sizes_data[i] >= 0,
                  errors::InvalidArgument("Dimension ", i,
                                          " of filter_sizes must be >= 0"));
      filter_shape.AddDim(filter_sizes_data[i]);
    }
    const TensorShape& input_shape = input.shape();

    EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropFilter");
    Tensor* filter_backprop = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {1}, 0, filter_shape, &filter_backprop));

    // If there is nothing to compute, return.
    if (out_backprop.shape().num_elements() == 0) {
      return;
    }

    // If in_depth==1, this operation is just a standard convolution.
    // Depthwise convolution is a special case of cuDNN's grouped convolution.
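    // (With in_depth == 1, the depthwise filter [rows, cols, 1, multiplier] is
    // identical to a regular convolution filter with `multiplier` output
    // channels, so the regular Conv2D backprop-filter path applies directly.)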
    bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
                     (in_depth == 1 ||
                      (use_cudnn_grouped_conv_ &&
                       IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
                                                  /*filter_cols=*/filter_cols,
                                                  /*in_depth=*/in_depth,
                                                  /*out_depth=*/out_depth)));

    VLOG(2) << "DepthwiseConv2dNativeBackpropFilter: "
            << " Input: [" << batch << ", " << input_rows << ", " << input_cols
            << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
            << filter_cols << ", " << in_depth << ", " << depth_multiplier
            << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
            << ", " << out_depth << "], stride = " << stride_
            << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left
            << ", Use cuDNN: " << use_cudnn;

    if (use_cudnn) {
      // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
      //
      //                  | TensorFlow       | cuDNN
      // --------------------------------------------------------------------
      // filter_out_depth | depth_multiplier | depth_multiplier * group_count
      // filter_in_depth  | in_depth         | in_depth / group_count
      //
      // For depthwise convolution, we have group_count == in_depth.
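      // Illustration: with in_depth = 8 and depth_multiplier = 2, the TF
      // filter gradient of shape [rows, cols, 8, 2] is handed to cuDNN as a
      // grouped filter of shape [rows, cols, 1, 16] with group_count = 8.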
      int32 filter_in_depth = 1;
      TensorShape shape =
          TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
      Tensor reshaped_filter(/*type=*/dtype_);
      OP_REQUIRES(
          context, reshaped_filter.CopyFrom(*filter_backprop, shape),
          errors::Internal(
              "Failed to reshape filter tensor for grouped convolution."));

      // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
      // conv is supported.
      launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
                input,
                /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
                padding_, explicit_paddings_, &reshaped_filter, data_format_);
      return;
    }

    // For GPU inputs with type half, we cast inputs to float and outputs back
    // to half, as half implementation is slow and does not use full precision
    // accumulation in some cases.
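    // When no cast is needed, the casted_* tensors below are shallow copies
    // that alias the original buffers, so the non-half (and CPU) paths incur
    // no extra allocation or copy.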
    constexpr bool cast_to_float = std::is_same<T, Eigen::half>::value &&
                                   std::is_same<Device, GPUDevice>::value;
    using U = typename std::conditional<cast_to_float, float, T>::type;
    Tensor casted_out_backprop = out_backprop;
    Tensor casted_input = input;
    Tensor casted_filter_backprop = *filter_backprop;
    const Device& device = context->template eigen_device<Device>();
    if (cast_to_float) {
      functor::CastFunctor<Device, float, Eigen::half> cast;
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DT_FLOAT, out_backprop.shape(),
                                            &casted_out_backprop));
      cast(device, casted_out_backprop.template flat<float>(),
           out_backprop.template flat<Eigen::half>());
      OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, input.shape(),
                                                     &casted_input));
      cast(device, casted_input.template flat<float>(),
           input.template flat<Eigen::half>());
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DT_FLOAT, filter_backprop->shape(),
                                            &casted_filter_backprop));
    }

    auto out_backprop_ptr = casted_out_backprop.template flat<U>().data();
    auto input_ptr = casted_input.template flat<U>().data();
    auto filter_backprop_ptr = casted_filter_backprop.template flat<U>().data();
    LaunchDepthwiseConvBackpropFilterOp<Device, U>()(
        context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr,
        data_format_);

    if (cast_to_float) {
      functor::CastFunctor<Device, Eigen::half, float> cast;
      const Tensor& casted_filter_backprop_const = casted_filter_backprop;
      cast(device, filter_backprop->template flat<Eigen::half>(),
           casted_filter_backprop_const.template flat<float>());
    }
  }

 protected:
  bool use_cudnn_grouped_conv_;

 private:
  std::vector<int32> strides_;
  Padding padding_;
  std::vector<int64> explicit_paddings_;
  TensorFormat data_format_;
  int64 stride_;

  // For in_depth == 1 and grouped convolutions.
  LaunchConv2DBackpropFilterOp<Device, T> launcher_;
  bool cudnn_use_autotune_;
  DataType dtype_;

  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
};

#define REGISTER_CPU_KERNEL(T)                    \
  REGISTER_KERNEL_BUILDER(                        \
      Name("DepthwiseConv2dNativeBackpropFilter") \
          .Device(DEVICE_CPU)                     \
          .TypeConstraint<T>("T"),                \
      DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
#endif
#undef REGISTER_CPU_KERNEL

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER_GPU_KERNEL(T)                                        \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
                              .Device(DEVICE_GPU)                     \
                              .TypeConstraint<T>("T")                 \
                              .HostMemory("filter_sizes"),            \
                          DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>)

TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL

#if CUDNN_VERSION >= 7000
template <typename T>
class DepthwiseConv2dGroupedConvBackpropFilterOp
    : public DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T> {
 public:
  DepthwiseConv2dGroupedConvBackpropFilterOp(OpKernelConstruction* context)
      : DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>(context) {
    this->use_cudnn_grouped_conv_ = true;
  }
};

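// Note: this variant is only selected when the op is built with the kernel
// label "cudnn_grouped_convolution" (e.g. via the node's "_kernel" attr);
// otherwise the default GPU kernel registered above is used.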
#define REGISTER_GROUPED_CONV_KERNEL(T)                               \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
                              .Device(DEVICE_GPU)                     \
                              .TypeConstraint<T>("T")                 \
                              .HostMemory("filter_sizes")             \
                              .Label("cudnn_grouped_convolution"),    \
                          DepthwiseConv2dGroupedConvBackpropFilterOp<T>)

TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
#undef REGISTER_GROUPED_CONV_KERNEL
#endif  // CUDNN_VERSION
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow