1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #define EIGEN_USE_THREADS
17 
18 #include <algorithm>
19 #include <cmath>
20 
21 #include "tensorflow/core/framework/bounds_check.h"
22 #include "tensorflow/core/framework/kernel_shape_util.h"
23 #include "tensorflow/core/framework/numeric_op.h"
24 #include "tensorflow/core/framework/op_kernel.h"
25 #include "tensorflow/core/framework/register_types.h"
26 #include "tensorflow/core/framework/tensor.h"
27 #include "tensorflow/core/framework/tensor_shape.h"
28 #include "tensorflow/core/framework/tensor_types.h"
29 #include "tensorflow/core/framework/types.h"
30 #include "tensorflow/core/kernels/cast_op.h"
31 #include "tensorflow/core/kernels/conv_grad_ops.h"
32 #include "tensorflow/core/kernels/depthwise_conv_op.h"
33 #include "tensorflow/core/lib/core/status.h"
34 #include "tensorflow/core/platform/logging.h"
35 #include "tensorflow/core/platform/types.h"
36 #include "tensorflow/core/util/padding.h"
37 #include "tensorflow/core/util/tensor_format.h"
38 #include "tensorflow/core/util/use_cudnn.h"
39 #include "tensorflow/core/util/work_sharder.h"
40 
41 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
42 
43 #if GOOGLE_CUDA
44 #include "third_party/gpus/cudnn/cudnn.h"
45 #endif
46 
47 #include "tensorflow/core/platform/stream_executor.h"
48 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
49 
50 namespace tensorflow {
51 
52 // Gradient operations for depthwise convolution.
53 
54 typedef Eigen::ThreadPoolDevice CPUDevice;
55 typedef Eigen::GpuDevice GPUDevice;
56 
57 // Common code between the two backward pass kernels: verifies that the
58 // dimensions all match and extracts the padded rows and columns.
59 #define EXTRACT_AND_VERIFY_DIMENSIONS(label)                                   \
60   const Tensor& out_backprop = context->input(2);                              \
61   OP_REQUIRES(                                                                 \
62       context, input_shape.dims() == 4,                                        \
63       errors::InvalidArgument(label, ": input must be 4-dimensional"));        \
64   OP_REQUIRES(                                                                 \
65       context, filter_shape.dims() == 4,                                       \
66       errors::InvalidArgument(label, ": filter must be 4-dimensional"));       \
67   OP_REQUIRES(                                                                 \
68       context, out_backprop.dims() == 4,                                       \
69       errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
70   const int64 batch = input_shape.dim_size(0);                                 \
71   OP_REQUIRES(                                                                 \
72       context, batch == out_backprop.dim_size(0),                              \
73       errors::InvalidArgument(                                                 \
74           label, ": input and out_backprop must have the same batch size"));   \
75   const int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H');   \
76   OP_REQUIRES(                                                                 \
77       context,                                                                 \
78       FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()),      \
79       errors::InvalidArgument("Input rows too large"));                        \
80   const int32 input_rows = static_cast<int32>(input_rows_raw);                 \
81   const int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W');   \
82   OP_REQUIRES(                                                                 \
83       context,                                                                 \
84       FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()),      \
85       errors::InvalidArgument("Input cols too large"));                        \
86   const int32 input_cols = static_cast<int32>(input_cols_raw);                 \
87   const int64 filter_rows = filter_shape.dim_size(0);                          \
88   const int64 filter_cols = filter_shape.dim_size(1);                          \
89   const int64 output_rows_raw =                                                \
90       GetTensorDim(out_backprop.shape(), data_format_, 'H');                   \
91   OP_REQUIRES(                                                                 \
92       context,                                                                 \
93       FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()),     \
94       errors::InvalidArgument("Output rows too large"));                       \
95   const int32 output_rows = static_cast<int32>(output_rows_raw);               \
96   const int64 output_cols_raw =                                                \
97       GetTensorDim(out_backprop.shape(), data_format_, 'W');                   \
98   OP_REQUIRES(                                                                 \
99       context,                                                                 \
100       FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()),     \
101       errors::InvalidArgument("Output cols too large"));                       \
102   const int32 output_cols = static_cast<int32>(output_cols_raw);               \
103   const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C');         \
104   OP_REQUIRES(context, in_depth == filter_shape.dim_size(2),                   \
105               errors::InvalidArgument(                                         \
106                   label, ": input and filter must have the same in_depth"));   \
107   const int64 depth_multiplier = filter_shape.dim_size(3);                     \
108   const int64 out_depth_raw =                                                  \
109       GetTensorDim(out_backprop.shape(), data_format_, 'C');                   \
110   OP_REQUIRES(                                                                 \
111       context,                                                                 \
112       FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()),       \
113       errors::InvalidArgument("Output depth too large"));                      \
114   const int32 out_depth = static_cast<int32>(out_depth_raw);                   \
115   OP_REQUIRES(                                                                 \
116       context, (depth_multiplier * in_depth) == out_depth,                     \
117       errors::InvalidArgument(                                                 \
118           label, ": depth_multiplier * in_depth not equal to out_depth"));     \
119   const auto stride = stride_;                                                 \
120   int64 out_rows = 0, out_cols = 0, pad_top = 0, pad_bottom = 0, pad_left = 0, \
121         pad_right = 0;                                                         \
122   if (padding_ == Padding::EXPLICIT) {                                         \
123     GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'H', &pad_top,  \
124                              &pad_bottom);                                     \
125     GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'W', &pad_left, \
126                              &pad_right);                                      \
127   }                                                                            \
128   OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(                        \
129                               input_rows, filter_rows, stride_, padding_,      \
130                               &out_rows, &pad_top, &pad_bottom));              \
131   OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(                        \
132                               input_cols, filter_cols, stride_, padding_,      \
133                               &out_cols, &pad_left, &pad_right));              \
134   OP_REQUIRES(                                                                 \
135       context, output_rows == out_rows,                                        \
136       errors::InvalidArgument(                                                 \
137           label, ": Number of rows of out_backprop doesn't match computed: ",  \
138           "actual = ", output_rows, ", computed = ", out_rows));               \
139   OP_REQUIRES(                                                                 \
140       context, output_cols == out_cols,                                        \
141       errors::InvalidArgument(                                                 \
142           label, ": Number of cols of out_backprop doesn't match computed: ",  \
143           "actual = ", output_cols, ", computed = ", out_cols));               \
144   DepthwiseArgs args;                                                          \
145   args.batch = batch;                                                          \
146   args.in_rows = input_rows;                                                   \
147   args.in_cols = input_cols;                                                   \
148   args.in_depth = in_depth;                                                    \
149   args.filter_rows = filter_rows;                                              \
150   args.filter_cols = filter_cols;                                              \
151   args.depth_multiplier = depth_multiplier;                                    \
152   args.stride = stride;                                                        \
153   args.pad_rows = pad_top;                                                     \
154   args.pad_cols = pad_left;                                                    \
155   args.out_rows = out_rows;                                                    \
156   args.out_cols = out_cols;                                                    \
157   args.out_depth = out_depth;                                                  \
158   VLOG(2) << "DepthwiseConv2d: " << label << " Input: [" << batch << ", "      \
159           << input_rows << ", " << input_cols << ", " << in_depth              \
160           << "]; Filter: [" << filter_rows << ", " << filter_cols << ", "      \
161           << in_depth << ", " << depth_multiplier << "]; stride = " << stride  \
162           << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left         \
163           << ", output: [" << batch << ", " << out_rows << ", " << out_cols    \
164           << ", " << out_depth << "]";
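// For reference, GetWindowedOutputSizeVerbose applies the usual TensorFlow
// windowed-output rules (a sketch under that assumption, not the exact
// implementation): for input size n, filter size k and stride s,
//   VALID:    out = (n - k + s) / s (integer division), no padding;
//   SAME:     out = (n + s - 1) / s, with total padding
//             max((out - 1) * s + k - n, 0) split as pad_before = total / 2
//             and pad_after = total - pad_before;
//   EXPLICIT: the pads from 'explicit_paddings_' are used directly, so
//             out = (n + pad_before + pad_after - k) / s + 1.
// The macro then checks that out_backprop's spatial dimensions match.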
165 
166 // Copies data from local region in 'out_backprop' into 'buffer'.
167 // The local region coordinates are calculated as the set of output points which
168 // used the input point ('in_r', 'in_c') as input during the forward pass.
169 // Rather than spatially reversing the filter, the input is reversed during
170 // the copy. The copied data is padded to vector register-width boundaries so
171 // that it is aligned for efficient traversal and vector multiply-add by the
172 // depthwise input kernel.
173 //
174 // EX:
175 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
176 //
177 //   'out_backprop': [batch, out_rows, out_cols, out_depth]
178 //
179 //     [a00, a01, a10, a11] [a20, a21, b00, b01]
180 //     [b10, b11, b20, b21] [...]
181 //     [e00, e01, e10, e11] [e20, e21, f00, f01]
182 //     [f10, f11, f20, f21] [...]
183 //
184 //   'buffer' (register boundaries shown):
185 //
186 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
187 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
188 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
189 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
190 //
191 template <typename T>
192 static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
193                                      const int64_t padded_filter_inner_dim_size,
194                                      const int64_t in_r, const int64_t in_c,
195                                      const T* out_backprop, T* buffer) {
196   typedef typename Eigen::internal::packet_traits<T>::type Packet;
197   static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
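  // 'Packet' is Eigen's SIMD packet type for T, so kPacketSize is the number
  // of T elements per vector register (for example 8 for float with AVX; the
  // actual width depends on the build).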
198 
199   const int64_t stride = args.stride;
200   const int64_t filter_rows = args.filter_rows;
201   const int64_t filter_cols = args.filter_cols;
202   const int64_t pad_rows = args.pad_rows;
203   const int64_t pad_cols = args.pad_cols;
204   const int64_t out_rows = args.out_rows;
205   const int64_t out_cols = args.out_cols;
206 
207   // Calculate the output spatial region which used point (in_r, in_c) as input.
208   const int64_t out_r_start = std::max(
209       static_cast<int64>(0), (in_r - filter_rows + pad_rows + stride) / stride);
210   const int64_t out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
211   const int64_t out_c_start = std::max(
212       static_cast<int64>(0), (in_c - filter_cols + pad_cols + stride) / stride);
213   const int64_t out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);
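  // A point (out_r, out_c) lies in this region iff the filter offsets
  // f_r = in_r + pad_rows - out_r * stride and
  // f_c = in_c + pad_cols - out_c * stride fall in [0, filter_rows) and
  // [0, filter_cols), which rearranges to the bounds above (clamped to the
  // valid output range).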
214 
215   // Zero-pad 'buffer' if output region is smaller than filter spatial size.
216   const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
217   if ((out_r_end - out_r_start + 1) < args.filter_rows ||
218       (out_c_end - out_c_start + 1) < args.filter_cols) {
219     memset(buffer, 0,
220            filter_spatial_size * padded_filter_inner_dim_size * sizeof(T));
221   }
222 
223   // Calculate vectorized and scalar (residual) lengths for 'in_depth'.
224   const int64_t vectorized_size = (args.out_depth / kPacketSize) * kPacketSize;
225   const int64_t scalar_size = args.out_depth % kPacketSize;
226   const int64_t pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
227 
228   for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
229     const int64_t f_r = in_r + pad_rows - out_r * stride;
230     for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
231       const int64_t f_c = in_c + pad_cols - out_c * stride;
232       const int64_t buf_base =
233           (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
234       // Calculate index into 'out_backprop' for coordinate (out_r, out_c).
235       auto* out_bprop =
236           out_backprop + (out_r * args.out_cols + out_c) * args.out_depth;
237 
238       // Copy vectorized portion of inner dimension into 'buffer'.
239       for (int64_t d = 0; d < vectorized_size; d += kPacketSize) {
240         auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
241         Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
242       }
243       // Copy scalar portion of out_bprop to 'buffer'
244       for (int64_t d = 0; d < scalar_size; ++d) {
245         buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
246       }
247       // Pad to vector-register width (if needed).
248       for (int64_t d = 0; d < pad_size; ++d) {
249         buffer[buf_base + vectorized_size + scalar_size + d] =
250             static_cast<T>(0);
251       }
252     }
253   }
254 }
255 
256 // Computes the vectorized product of 'buffer' and 'filter' and stores
257 // result in 'output' at location computed from 'in_r' and 'in_c'.
258 // If depth_multiplier is > 1, the intermediate output is reduced along
259 // the depth_multiplier dimension.
260 //
261 // EX:
262 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
263 //   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
264 //
265 //   'buffer' [rows, cols, in_depth, depth_multiplier]
266 //
267 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
268 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
269 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
270 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
271 //
272 //   filter [rows, cols, in_depth, depth_multiplier]
273 //     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
274 //     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
275 //
276 //   First output register [in_depth, depth_multiplier]
277 //     [q00, q01, q10, q11] = ([f00, f01, f10, f11] x [u0, v0, w0, x0]) +
278 //                            ([e00, e01, e10, e11] x [u1, v1, w1, x1]) +
279 //                            ([b00, b01, b10, b11] x [u2, v2, w2, x2]) +
280 //                            ([a00, a01, a10, a11] x [u3, v3, w3, x3])
281 //
282 //   Reduction step along depth-multiplier dimension:
283 //
284 //     [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0]
285 //
286 
287 template <typename T>
288 static void ComputeBackpropInput(const DepthwiseArgs& args,
289                                  const int64_t padded_filter_inner_dim_size,
290                                  const int64_t in_r, const int64_t in_c,
291                                  const T* filter, const T* buffer,
292                                  T* out_buffer, T* output) {
293   typedef typename Eigen::internal::packet_traits<T>::type Packet;
294   static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
295 
296   const int64_t in_depth = args.in_depth;
297   const int64_t depth_multiplier = args.depth_multiplier;
298   const int64_t out_depth = args.out_depth;
299   const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
300 
301   // Calculate vectorized and scalar lengths of 'out_depth'.
302   const int64_t output_vectorized_size =
303       (out_depth / kPacketSize) * kPacketSize;
304   const int64_t output_scalar_size = out_depth % kPacketSize;
305 
306   // Calculate base index at which to begin writing output.
307   const int64_t base_output_index = (in_r * args.in_cols + in_c) * in_depth;
308 
309   // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is
310   // used to efficiently reduce output when 'depth_multiplier' > kPacketSize.
311   const int64_t dm_vectorized_size =
312       (depth_multiplier / kPacketSize) * kPacketSize;
313   const int64_t dm_scalar_size = depth_multiplier % kPacketSize;
314 
315   for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
316     // Reset accumulator.
317     auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
318     for (int j = 0; j < filter_spatial_size; ++j) {
319       // Calculate index.
320       const int64_t index = i + j * padded_filter_inner_dim_size;
321       // Load filter.
322       const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
323       // Load input.
324       const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
325       // Vector multiply-add.
326       vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
327     }
328     if (depth_multiplier == 1) {
329       // Write directly to the output.
330       Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
331     } else {
332       // Buffer output for subsequent reduction step.
333       Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
334     }
335   }
336 
337   if (output_scalar_size > 0) {
338     auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
339     for (int j = 0; j < filter_spatial_size; ++j) {
340       const int64_t index =
341           output_vectorized_size + j * padded_filter_inner_dim_size;
342       const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
343       const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
344       vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
345     }
346     // Load accumulator into an array and loop through output.
347     T out_buf[kPacketSize];
348     Eigen::internal::pstoreu<T>(out_buf, vaccum);
349     if (depth_multiplier == 1) {
350       // Write directly to the output.
351       for (int j = 0; j < output_scalar_size; ++j) {
352         output[base_output_index + output_vectorized_size + j] = out_buf[j];
353       }
354     } else {
355       // Buffer output for subsequent reduction step.
356       for (int j = 0; j < output_scalar_size; ++j) {
357         out_buffer[output_vectorized_size + j] = out_buf[j];
358       }
359     }
360   }
361 
362   // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'.
363   if (depth_multiplier > 1) {
364     for (int64_t d = 0; d < in_depth; ++d) {
365       const int64_t index = d * args.depth_multiplier;
366       T accum = static_cast<T>(0);
367       for (int64_t dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
368         const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
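        // predux(v) horizontally sums the packet lanes, e.g. a 4-wide packet
        // [a, b, c, d] reduces to a + b + c + d.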
369         accum += Eigen::internal::predux(v);
370       }
371       // Copy scalar portion of replicated output.
372       for (int64_t dm = 0; dm < dm_scalar_size; ++dm) {
373         accum += out_buffer[index + dm_vectorized_size + dm];
374       }
375       // Copy to output.
376       output[base_output_index + d] = accum;
377     }
378   }
379 }
380 
381 // Computes the depthwise conv2d backprop input of 'out_backprop' by
382 // 'depthwise_filter' and stores the result in 'in_backprop'.
383 template <typename T>
384 struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
385   typedef typename Eigen::internal::packet_traits<T>::type Packet;
386 
387   void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
388                   const T* out_backprop, const T* depthwise_filter,
389                   T* in_backprop, TensorFormat data_format) {
390     OP_REQUIRES(
391         ctx, data_format == FORMAT_NHWC,
392         errors::Unimplemented(
393             "Depthwise convolution on CPU is only supported for NHWC format"));
394 
395     static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
396 
397     // Pad 'depthwise_filter' to vector register width (if needed).
398     const bool pad_filter = (args.out_depth % kPacketSize) != 0;
399     Tensor padded_filter;
400     if (pad_filter) {
401       // Allocate space for padded filter.
402       const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
403       const int64_t padded_filter_inner_dim_size =
404           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
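      // e.g. out_depth = 6 with kPacketSize = 4 rounds up to 8.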
405       OP_REQUIRES_OK(
406           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
407                                   TensorShape({filter_spatial_size,
408                                                padded_filter_inner_dim_size}),
409                                   &padded_filter));
410       // Write out padded filter.
411       functor::DepthwiseFilterPadOp<T>()(
412           args, depthwise_filter, padded_filter.template flat<T>().data());
413     }
414     const T* filter_data =
415         pad_filter ? padded_filter.template flat<T>().data() : depthwise_filter;
416 
417     // Computes one shard of depthwise conv2d backprop input.
418     auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
419                      int64_t start, int64_t limit) {
420       static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
421 
422       const int64_t input_image_size =
423           args.in_rows * args.in_cols * args.in_depth;
424       const int64_t output_image_size =
425           args.out_rows * args.out_cols * args.out_depth;
426       const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
427       const int64_t padded_filter_inner_dim_size =
428           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
429 
430       // Allocate buffer to copy regions from 'out_backprop'.
431       Tensor out_bprop_buffer;
432       OP_REQUIRES_OK(
433           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
434                                   TensorShape({filter_spatial_size,
435                                                padded_filter_inner_dim_size}),
436                                   &out_bprop_buffer));
437       T* out_bprop_buf = out_bprop_buffer.template flat<T>().data();
438 
439       // Allocate buffer for intermediate results.
440       Tensor in_bprop_buffer;
441       OP_REQUIRES_OK(
442           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
443                                   TensorShape({padded_filter_inner_dim_size}),
444                                   &in_bprop_buffer));
445       T* in_bprop_buf = in_bprop_buffer.template flat<T>().data();
446 
447       for (int64_t b = start; b < limit; ++b) {
448         for (int64_t in_r = 0; in_r < args.in_rows; ++in_r) {
449           for (int64_t in_c = 0; in_c < args.in_cols; ++in_c) {
450             // Populate 'out_bprop_buf' from local 'out_backprop' region.
451             CopyOutputBackpropRegion<T>(
452                 args, padded_filter_inner_dim_size, in_r, in_c,
453                 out_backprop + b * output_image_size, out_bprop_buf);
454 
455             // Compute depthwise backprop input.
456             ComputeBackpropInput<T>(args, padded_filter_inner_dim_size, in_r,
457                                     in_c, filter_data, out_bprop_buf,
458                                     in_bprop_buf,
459                                     in_backprop + b * input_image_size);
460           }
461         }
462       }
463     };
464 
465     const int64_t shard_cost = args.in_rows * args.in_cols * args.out_depth;
466     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
467     Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
468           shard_cost, shard);
469   }
470 };
471 
472 template <typename T>
473 static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
474                                                 const T* out_backprop,
475                                                 const T* filter,
476                                                 T* in_backprop) {
477   // Naive for loop as a reference point without concerns about performance.
478   for (int b = 0; b < args.batch; ++b) {
479     for (int in_r = 0; in_r < args.in_rows; ++in_r) {
480       for (int in_c = 0; in_c < args.in_cols; ++in_c) {
481         for (int in_d = 0; in_d < args.in_depth; ++in_d) {
482           T sum = 0;
483           const int stride = args.stride;
484           const int out_d_start = in_d * args.depth_multiplier;
485           const int out_d_end = out_d_start + args.depth_multiplier;
486 
487           for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
488             const int out_r_start = std::max(
489                 0, (in_r - args.filter_rows + args.pad_rows + stride) / stride);
490             const int out_r_end =
491                 std::min(args.out_rows - 1, (in_r + args.pad_rows) / stride);
492 
493             for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
494               const int out_c_start = std::max(
495                   0,
496                   (in_c - args.filter_cols + args.pad_cols + stride) / stride);
497               const int out_c_end =
498                   std::min(args.out_cols - 1, (in_c + args.pad_cols) / stride);
499 
500               for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
501                 int f_r = in_r + args.pad_rows - out_r * stride;
502                 int f_c = in_c + args.pad_cols - out_c * stride;
503                 int filter_dm = out_d - out_d_start;
504                 int out_backprop_offset =
505                     out_d +
506                     args.out_depth *
507                         (out_c + args.out_cols * (out_r + args.out_rows * b));
508                 int filter_offset =
509                     filter_dm +
510                     args.depth_multiplier *
511                         (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
512                 sum +=
513                     out_backprop[out_backprop_offset] * filter[filter_offset];
514               }
515             }
516           }
517 
518           int in_backprop_offset =
519               in_d +
520               args.in_depth * (in_c + args.in_cols * (in_r + args.in_rows * b));
521           in_backprop[in_backprop_offset] = sum;
522         }
523       }
524     }
525   }
526 }
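// A minimal sketch (hypothetical, not part of this file) of using the
// reference kernel above to cross-check the vectorized CPU path, given a
// populated DepthwiseArgs 'args' and matching host buffers:
//
//   std::vector<float> expected(args.batch * args.in_rows * args.in_cols *
//                               args.in_depth);
//   DepthwiseConvBackpropInputReference<float>(args, out_backprop, filter,
//                                              expected.data());
//   // ... then compare 'expected' element-wise against the result of
//   // LaunchDepthwiseConvBackpropInputOp<CPUDevice, float>.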
527 
528 // Extern template instantiated in conv_grad_input_ops.cc.
529 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>;
530 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, float>;
531 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, double>;
532 
533 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
534 
535 // Extern template instantiated in conv_grad_input_ops.cc.
536 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
537 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
538 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;
539 
540 // Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
541 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
542                                                           Eigen::half>;
543 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
544 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
545 
546 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
547 
548 // Kernel to compute the input backprop for depthwise convolution.
549 template <typename Device, class T>
550 class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
551  public:
552   explicit DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction* context)
553       : OpKernel(context) {
554     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
555     OP_REQUIRES(context, strides_.size() == 4,
556                 errors::InvalidArgument("Sliding window strides field must "
557                                         "specify 4 dimensions"));
558 
559     string data_format;
560     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
561     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
562                 errors::InvalidArgument("Invalid data format"));
563 
564     stride_ = GetTensorDim(strides_, data_format_, 'H');
565     const int64_t stride_w = GetTensorDim(strides_, data_format_, 'W');
566     const int64_t stride_n = GetTensorDim(strides_, data_format_, 'N');
567     const int64_t stride_c = GetTensorDim(strides_, data_format_, 'C');
568 
569     OP_REQUIRES(context, stride_ == stride_w,
570                 errors::InvalidArgument(
571                     "Current implementation only supports equal length "
572                     "strides in the row and column dimensions."));
573     OP_REQUIRES(
574         context, (stride_n == 1 && stride_c == 1),
575         errors::InvalidArgument("Current implementation does not yet support "
576                                 "strides in the batch and depth dimensions."));
577     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
578     OP_REQUIRES_OK(context,
579                    context->GetAttr("explicit_paddings", &explicit_paddings_));
580     OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
581                                               /*num_dims=*/4, data_format_));
582 
583     cudnn_use_autotune_ = CudnnUseAutotune();
584     dtype_ = DataTypeToEnum<T>::value;
585 #if CUDNN_VERSION >= 8000
586     // From the cuDNN release note 8.0: We’ve extended the fprop and dgrad
587     // NHWC depthwise kernels to support more combinations (filter
588     // sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
589     // to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
590     // good performance. (https://docs.nvidia.com/deeplearning/sdk/cudnn-
591     // release-notes/rel_8.html#rel_8)
592     use_cudnn_grouped_conv_ =
593         dtype_ == DT_HALF &&
594         ((data_format_ == FORMAT_NCHW && stride_ == 1 && stride_w == 1) ||
595          (data_format_ == FORMAT_NHWC && stride_ == stride_w &&
596           (stride_ == 1 || stride_ == 2)));
597 #elif CUDNN_VERSION >= 7603
598     // Use CuDNN grouped conv (input gradient) when stride = 1, input/output is
599     // NCHW and float16(half). See cudnn release note 7.6.3 (https://docs.nvidi
600     // a.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763).
601     use_cudnn_grouped_conv_ = dtype_ == DT_HALF &&
602                               data_format_ == FORMAT_NCHW && stride_ == 1 &&
603                               stride_w == 1;
604 #else
605     use_cudnn_grouped_conv_ = false;
606 #endif
607   }
608 
609   void Compute(OpKernelContext* context) override {
610     const Tensor& input_sizes = context->input(0);
611     const Tensor& filter = context->input(1);
612     OP_REQUIRES(
613         context, TensorShapeUtils::IsVector(input_sizes.shape()),
614         errors::InvalidArgument(
615             "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
616             input_sizes.dims()));
617     TensorShape input_shape;
618     const int32* in_sizes_data = input_sizes.template flat<int32>().data();
619 
620     for (int i = 0; i < input_sizes.NumElements(); ++i) {
621       OP_REQUIRES(context, in_sizes_data[i] >= 0,
622                   errors::InvalidArgument("Dimension ", i,
623                                           " of input_sizes must be >= 0"));
624       input_shape.AddDim(in_sizes_data[i]);
625     }
626     const TensorShape& filter_shape = filter.shape();
627     EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
628 
629     Tensor* in_backprop = nullptr;
630     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
631                                 {0}, 0, input_shape, &in_backprop));
632 
633     // If there is nothing to compute, return.
634     if (input_shape.num_elements() == 0) {
635       return;
636     }
637 
638     // If in_depth==1, this operation is just a standard convolution.
639     // Depthwise convolution is a special case of cuDNN's grouped convolution.
640     bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
641                      (in_depth == 1 ||
642                       (use_cudnn_grouped_conv_ &&
643                        IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
644                                                   /*filter_cols=*/filter_cols,
645                                                   /*in_depth=*/in_depth,
646                                                   /*out_depth=*/out_depth)));
647 
648     VLOG(2) << "DepthwiseConv2dNativeBackpropInput: "
649             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
650             << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
651             << filter_cols << ", " << in_depth << ", " << depth_multiplier
652             << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
653             << ", " << out_depth << "], stride = " << stride_
654             << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left
655             << ", Use cuDNN: " << use_cudnn;
656 
657     if (use_cudnn) {
658       // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
659       //
660       //                  | TensorFlow       | cuDNN
661       // --------------------------------------------------------------------
662       // filter_out_depth | depth_multiplier | depth_multiplier * group_count
663       // filter_in_depth  | in_depth         | in_depth / group_count
664       //
665       // For depthwise convolution, we have group_count == in_depth.
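      // Example (a sketch): with in_depth = 8 and depth_multiplier = 2, the
      // TensorFlow filter [rows, cols, 8, 2] is viewed as a cuDNN grouped
      // filter [rows, cols, 1, 16] with group_count = 8.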
666       int32_t filter_in_depth = 1;
667       TensorShape shape =
668           TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
669       Tensor reshaped_filter(/*type=*/dtype_);
670       OP_REQUIRES(
671           context, reshaped_filter.CopyFrom(filter, shape),
672           errors::Internal(
673               "Failed to reshape filter tensor for grouped convolution."));
674       // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
675       // conv is supported.
676       launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
677                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
678                 stride_, stride_, padding_, explicit_paddings_, in_backprop,
679                 data_format_);
680       return;
681     }
682 
683     auto out_backprop_ptr = out_backprop.template flat<T>().data();
684     auto filter_ptr = filter.template flat<T>().data();
685     auto in_backprop_ptr = in_backprop->template flat<T>().data();
686     LaunchDepthwiseConvBackpropInputOp<Device, T>()(
687         context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
688         data_format_);
689   }
690 
691  protected:
692   bool use_cudnn_grouped_conv_;
693 
694  private:
695   std::vector<int32> strides_;
696   Padding padding_;
697   std::vector<int64> explicit_paddings_;
698   TensorFormat data_format_;
699   int64 stride_;
700 
701   // For in_depth == 1 and grouped convolutions.
702   LaunchConv2DBackpropInputOp<Device, T> launcher_;
703   bool cudnn_use_autotune_;
704   DataType dtype_;
705 
706   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
707 };
708 
709 #define REGISTER_CPU_KERNEL(T)                                       \
710   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
711                               .Device(DEVICE_CPU)                    \
712                               .TypeConstraint<T>("T"),               \
713                           DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);
714 
715 TF_CALL_half(REGISTER_CPU_KERNEL);
716 TF_CALL_float(REGISTER_CPU_KERNEL);
717 #if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
718 TF_CALL_double(REGISTER_CPU_KERNEL);
719 #endif
720 #undef REGISTER_CPU_KERNEL
721 
722 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
723 
724 #define REGISTER_GPU_KERNEL(T)                                       \
725   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
726                               .Device(DEVICE_GPU)                    \
727                               .TypeConstraint<T>("T")                \
728                               .HostMemory("input_sizes"),            \
729                           DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>)
730 
731 TF_CALL_half(REGISTER_GPU_KERNEL);
732 TF_CALL_float(REGISTER_GPU_KERNEL);
733 TF_CALL_double(REGISTER_GPU_KERNEL);
734 #undef REGISTER_GPU_KERNEL
735 
736 #if CUDNN_VERSION >= 7000
737 template <typename T>
738 class DepthwiseConv2dGroupedConvBackpropInputOp
739     : public DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T> {
740  public:
741   DepthwiseConv2dGroupedConvBackpropInputOp(OpKernelConstruction* context)
742       : DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>(context) {
743     this->use_cudnn_grouped_conv_ = true;
744   }
745 };
746 
747 #define REGISTER_GROUPED_CONV_KERNEL(T)                              \
748   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
749                               .Device(DEVICE_GPU)                    \
750                               .TypeConstraint<T>("T")                \
751                               .HostMemory("input_sizes")             \
752                               .Label("cudnn_grouped_convolution"),   \
753                           DepthwiseConv2dGroupedConvBackpropInputOp<T>)
754 
755 TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
756 TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
757 TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
758 #undef REGISTER_GROUPED_CONV_KERNEL
759 #endif  // CUDNN_VERSION
760 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
761 
762 // Kernels to compute the gradients of the filters for depthwise convolution.
763 
764 // Computes filter backprop using 'out_backprop' and 'input_buffer', storing the
765 // result in 'output_buffer' at an index computed from 'out_r' and 'out_c'.
766 //
767 // EX:
768 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
769 //   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
770 //
771 //   'input_buffer' [rows, cols, in_depth, depth_multiplier]
772 //
773 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
774 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
775 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
776 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
777 //
778 //   'out_backprop' [out_rows, out_cols, in_depth, depth_multiplier]
779 //
780 //     [q00, q01, q10, q11] [q20, q21, r00, r01]
781 //     [r10, r11, r20, r21] [s00, s01, s10, s11]
782 //     [s20, s21, t00, t01] [t10, t11, t20, t21]
783 //
784 //   First output register of 'filter_backprop'
785 //     [u0, v0, w0, x0] += ([f00, f01, f10, f11] x [q00, q01, q10, q11])
786 //
787 template <typename T>
788 static void ComputeBackpropFilter(const DepthwiseArgs& args,
789                                   const int64_t padded_out_depth_size,
790                                   const int64_t out_r, const int64_t out_c,
791                                   const T* out_backprop, const T* input_buffer,
792                                   T* output_buffer) {
793   typedef typename Eigen::internal::packet_traits<T>::type Packet;
794   static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
795   // Calculate vectorized size of 'padded_out_depth_size'.
796   const int64_t out_depth = args.out_depth;
797   const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
798   const int64_t output_vectorized_size =
799       (padded_out_depth_size / kPacketSize) * kPacketSize;
800   const int64_t base_output_index = (out_r * args.out_cols + out_c) * out_depth;
801   // Determine whether we can execute fast or slow code path.
802   const int64_t output_image_size =
803       args.out_rows * args.out_cols * args.out_depth;
804   const int64_t output_last_vector_index =
805       output_image_size - (filter_spatial_size * padded_out_depth_size);
806   const bool fast_path = base_output_index <= output_last_vector_index;
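  // When 'fast_path' holds, at least one padded filter's worth of elements
  // remains at or after 'base_output_index', which (conservatively)
  // guarantees that the unaligned packet loads from 'out_backprop' below stay
  // in bounds.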
807 
808   if (fast_path) {
809     // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can
810     // amortize the cost of 'output_buffer' load store in the loop below.
811     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
812       // Load vector register from 'out_backprop'.
813       const auto out_bprop_block =
814           Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
815       for (int j = 0; j < filter_spatial_size; ++j) {
816         const int64_t index = i + j * padded_out_depth_size;
817         // Load vector register from 'input_buffer'.
818         const auto input_block =
819             Eigen::internal::ploadu<Packet>(input_buffer + index);
820         // Load output block into vector register.
821         auto out_block_data = output_buffer + index;
822         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
823         // Vector multiply-add.
824         out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
825                                                    out_block);
826         // Store 'out_block' back to memory.
827         Eigen::internal::pstoreu<T>(out_block_data, out_block);
828       }
829     }
830   } else {
831     // Slow path (can't do vector reads from non-padded 'out_backprop').
832     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
833       // Calculate safe read size from 'out_backprop'.
834       const int64_t out_bprop_index = base_output_index + i;
835       const int64_t out_bprop_limit =
836           std::min(output_image_size, out_bprop_index + kPacketSize);
837       T out_buf[kPacketSize];
838       memset(&out_buf, 0, kPacketSize * sizeof(T));
839       const int64_t scalar_size = out_bprop_limit - out_bprop_index;
840       for (int64_t j = 0; j < scalar_size; ++j) {
841         out_buf[j] = out_backprop[out_bprop_index + j];
842       }
843       // Load vector register from 'out_buf'.
844       const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
845       for (int j = 0; j < filter_spatial_size; ++j) {
846         const int64_t index = i + j * padded_out_depth_size;
847         // Load vector register from 'input_buffer'.
848         const auto input_block =
849             Eigen::internal::ploadu<Packet>(input_buffer + index);
850         // Load output block into vector register.
851         auto out_block_data = output_buffer + index;
852         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
853         // Vector multiply-add.
854         out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
855                                                    out_block);
856         // Store 'out_block' back to memory.
857         Eigen::internal::pstoreu<T>(out_block_data, out_block);
858       }
859     }
860   }
861 }
862 
863 template <typename Device, typename T>
864 struct LaunchDepthwiseConvBackpropFilterOp;
865 
866 template <typename T>
867 struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
868   typedef typename Eigen::internal::packet_traits<T>::type Packet;
869 
870   void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
871                   const T* out_backprop, const T* input, T* filter_backprop,
872                   TensorFormat data_format) {
873     OP_REQUIRES(
874         ctx, data_format == FORMAT_NHWC,
875         errors::Unimplemented(
876             "Depthwise convolution on CPU is only supported for NHWC format"));
877 
878     static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
879 
880     const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
881     const int64_t padded_out_depth_size =
882         ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
883 
884     // Allocate output buffers for each image in 'batch' (padded to vector
885     // register boundaries).
886     Tensor output_buffer;
887     OP_REQUIRES_OK(
888         ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
889                                 TensorShape({args.batch, filter_spatial_size,
890                                              padded_out_depth_size}),
891                                 &output_buffer));
892     T* output_buffer_data = output_buffer.template flat<T>().data();
893 
894     // Computes one shard of depthwise conv2d backprop filter.
895     auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
896                      int64_t start, int64_t limit) {
897       static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
898       const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
899       const int64_t padded_out_depth_size =
900           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
901 
902       // Allocate buffer for local input regions.
903       Tensor input_buffer;
904       OP_REQUIRES_OK(
905           ctx, ctx->allocate_temp(
906                    DataTypeToEnum<T>::value,
907                    TensorShape({filter_spatial_size, padded_out_depth_size}),
908                    &input_buffer));
909       T* input_buffer_data = input_buffer.template flat<T>().data();
910 
911       const int64_t input_image_size =
912           args.in_rows * args.in_cols * args.in_depth;
913       const int64_t output_image_size =
914           args.out_rows * args.out_cols * args.out_depth;
915       const int64_t padded_filter_size =
916           filter_spatial_size * padded_out_depth_size;
917 
918       for (int b = start; b < limit; ++b) {
919         // Initialize 'output_buffer' for 'b'.
920         auto* output_buffer = output_buffer_data + b * padded_filter_size;
921         memset(output_buffer, 0, padded_filter_size * sizeof(T));
922 
923         for (int out_r = 0; out_r < args.out_rows; ++out_r) {
924           for (int out_c = 0; out_c < args.out_cols; ++out_c) {
925             // Populate 'input_buffer_data' with data from local input region.
926             functor::DepthwiseInputCopyOp<T>()(
927                 args, padded_out_depth_size, out_r, out_c,
928                 input + b * input_image_size, input_buffer_data);
929             // Compute depthwise backprop filter.
930             ComputeBackpropFilter(args, padded_out_depth_size, out_r, out_c,
931                                   out_backprop + b * output_image_size,
932                                   input_buffer_data, output_buffer);
933           }
934         }
935       }
936     };
937     const int64_t shard_cost = args.out_rows * args.out_cols * args.out_depth;
938     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
939     Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
940           shard_cost, shard);
941 
942     // Accumulate 'output_buffer' from each shard into 'output'.
943     const int64_t out_depth = args.out_depth;
944     const int64_t vectorized_size = (out_depth / kPacketSize) * kPacketSize;
945     const int64_t scalar_size = out_depth - vectorized_size;
946     const int64_t padded_filter_size =
947         filter_spatial_size * padded_out_depth_size;
948     memset(filter_backprop, 0, filter_spatial_size * out_depth * sizeof(T));
949 
950     for (int64_t i = 0; i < filter_spatial_size; ++i) {
951       const int64_t buffer_base = i * padded_out_depth_size;
952       const int64_t output_base = i * out_depth;
953       // Write vectorized length of filter's inner dimension to output.
954       for (int64_t j = 0; j < vectorized_size; j += kPacketSize) {
955         // Load data from 'filter_backprop' into vector register.
956         auto out_block_data = filter_backprop + output_base + j;
957         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
958         for (int b = 0; b < args.batch; ++b) {
959           // Load data from 'output_buffer' for 'b'.
960           const auto* output_buffer =
961               output_buffer_data + b * padded_filter_size;
962           const auto v =
963               Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j);
964           // Add 'v' to 'out_block'.
965           out_block = Eigen::internal::padd<Packet>(out_block, v);
966         }
967         // Store 'out_block' back to memory.
968         Eigen::internal::pstoreu<T>(out_block_data, out_block);
969       }
970       // Write scalar length of filter's inner dimension to output.
971       for (int64_t j = 0; j < scalar_size; ++j) {
972         for (int b = 0; b < args.batch; ++b) {
973           const auto* output_buffer =
974               output_buffer_data + b * padded_filter_size;
975           filter_backprop[output_base + vectorized_size + j] +=
976               output_buffer[buffer_base + vectorized_size + j];
977         }
978       }
979     }
980   }
981 };
982 
983 template <typename T>
984 static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
985                                                  const T* out_backprop,
986                                                  const T* input,
987                                                  T* filter_backprop) {
988   int num_filter_backprop = args.filter_rows * args.filter_cols *
989                             args.in_depth * args.depth_multiplier;
990   memset(filter_backprop, 0, num_filter_backprop * sizeof(T));
991   // Naive for loop as a reference point without concerns about performance.
992   for (int b = 0; b < args.batch; ++b) {
993     for (int out_r = 0; out_r < args.out_rows; ++out_r) {
994       for (int out_c = 0; out_c < args.out_cols; ++out_c) {
995         for (int out_d = 0; out_d < args.out_depth; ++out_d) {
996           const int in_d = out_d / args.depth_multiplier;
997           const int dm = out_d % args.depth_multiplier;
998           const int in_r_start = out_r * args.stride - args.pad_rows;
999           const int in_c_start = out_c * args.stride - args.pad_cols;
1000 
1001           for (int f_r = 0; f_r < args.filter_rows; ++f_r) {
1002             for (int f_c = 0; f_c < args.filter_cols; ++f_c) {
1003               const int in_r = in_r_start + f_r;
1004               const int in_c = in_c_start + f_c;
1005 
1006               if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
1007                   in_c < args.in_cols) {
1008                 int out_backprop_offset =
1009                     out_d +
1010                     args.out_depth *
1011                         (out_c + args.out_cols * (out_r + args.out_rows * b));
1012                 int input_offset =
1013                     in_d +
1014                     args.in_depth *
1015                         (in_c + args.in_cols * (in_r + args.in_rows * b));
1016                 int filter_backprop_offset =
1017                     dm +
1018                     args.depth_multiplier *
1019                         (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
1020                 filter_backprop[filter_backprop_offset] +=
1021                     input[input_offset] * out_backprop[out_backprop_offset];
1022               }
1023             }
1024           }
1025         }
1026       }
1027     }
1028   }
1029 }
1030 
1031 // Extern template instantiated in conv_grad_filter_ops.cc.
1032 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, Eigen::half>;
1033 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, float>;
1034 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, double>;
1035 
1036 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1037 
1038 // Extern template instantiated in conv_grad_filter_ops.cc.
1039 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, Eigen::half>;
1040 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, float>;
1041 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, double>;
1042 
1043 // Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
1044 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
1045                                                            Eigen::half>;
1046 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
1047 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
1048 
1049 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1050 
1051 // Kernel to compute the filter backprop for depthwise convolution.
1052 template <typename Device, class T>
1053 class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
1054  public:
1055   explicit DepthwiseConv2dNativeBackpropFilterOp(OpKernelConstruction* context)
1056       : OpKernel(context) {
1057     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
1058     OP_REQUIRES(context, strides_.size() == 4,
1059                 errors::InvalidArgument("Sliding window strides field must "
1060                                         "specify 4 dimensions"));
1061 
1062     string data_format;
1063     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
1064     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
1065                 errors::InvalidArgument("Invalid data format"));
1066 
1067     stride_ = GetTensorDim(strides_, data_format_, 'H');
1068     const int64_t stride_w = GetTensorDim(strides_, data_format_, 'W');
1069     const int64_t stride_n = GetTensorDim(strides_, data_format_, 'N');
1070     const int64_t stride_c = GetTensorDim(strides_, data_format_, 'C');
1071 
1072     OP_REQUIRES(context, stride_ == stride_w,
1073                 errors::InvalidArgument(
1074                     "Current implementation only supports equal length "
1075                     "strides in the row and column dimensions."));
1076     OP_REQUIRES(
1077         context, (stride_n == 1 && stride_c == 1),
1078         errors::InvalidArgument("Current implementation does not yet support "
1079                                 "strides in the batch and depth dimensions."));
1080     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1081     OP_REQUIRES_OK(context,
1082                    context->GetAttr("explicit_paddings", &explicit_paddings_));
1083     OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
1084                                               /*num_dims=*/4, data_format_));
1085 
1086     cudnn_use_autotune_ = CudnnUseAutotune();
1087 
1088     if (std::is_same<T, Eigen::half>::value) {
1089       dtype_ = DT_HALF;
1090     } else if (std::is_same<T, float>::value) {
1091       dtype_ = DT_FLOAT;
1092     } else if (std::is_same<T, double>::value) {
1093       dtype_ = DT_DOUBLE;
1094     } else {
1095       LOG(ERROR) << "Only half, float, and double are supported.";
1096     }
1097     // Use CuDNN grouped conv (filter gradients) when input/output is
1098     // float16(half). See cudnn release note 7.6.3. (https://docs.nvidia.com/dee
1099     // plearning/sdk/cudnn-release-notes/rel_763.html#rel_763)
1100 #if CUDNN_VERSION >= 7603
1101     use_cudnn_grouped_conv_ = dtype_ == DT_HALF;
1102 #else
1103     use_cudnn_grouped_conv_ = false;
1104 #endif
1105   }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& filter_sizes = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(filter_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
            filter_sizes.dims()));
    TensorShape filter_shape;
    const int32* filter_sizes_data = filter_sizes.template flat<int32>().data();
    for (int i = 0; i < filter_sizes.NumElements(); ++i) {
      OP_REQUIRES(context, filter_sizes_data[i] >= 0,
                  errors::InvalidArgument("Dimension ", i,
                                          " of filter_sizes must be >= 0"));
      filter_shape.AddDim(filter_sizes_data[i]);
    }
    const TensorShape& input_shape = input.shape();

    EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropFilter");
    Tensor* filter_backprop = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {1}, 0, filter_shape, &filter_backprop));

    // If there is nothing to compute, return.
    if (out_backprop.shape().num_elements() == 0) {
      return;
    }

    // If in_depth==1, this operation is just a standard convolution.
    // Depthwise convolution is a special case of cuDNN's grouped convolution.
    bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
                     (in_depth == 1 ||
                      (use_cudnn_grouped_conv_ &&
                       IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
                                                  /*filter_cols=*/filter_cols,
                                                  /*in_depth=*/in_depth,
                                                  /*out_depth=*/out_depth)));
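    // If the cuDNN path is not taken, LaunchDepthwiseConvBackpropFilterOp
    // below computes the gradient with the depthwise-specific kernels.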

    VLOG(2) << "DepthwiseConv2dNativeBackpropFilter: "
            << " Input: [" << batch << ", " << input_rows << ", " << input_cols
            << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
            << filter_cols << ", " << in_depth << ", " << depth_multiplier
            << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
            << ", " << out_depth << "], stride = " << stride_
            << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left
            << ", Use cuDNN: " << use_cudnn;

    if (use_cudnn) {
      // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
      //
      //                  | TensorFlow       | cuDNN
      // --------------------------------------------------------------------
      // filter_out_depth | depth_multiplier | depth_multiplier * group_count
      // filter_in_depth  | in_depth         | in_depth / group_count
      //
      // For depthwise convolution, we have group_count == in_depth.
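      // Hence filter_in_depth becomes 1 and the cuDNN filter output depth is
      // in_depth * depth_multiplier, which equals out_depth.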
      int32_t filter_in_depth = 1;
      TensorShape shape =
          TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
      Tensor reshaped_filter(/*type=*/dtype_);
      OP_REQUIRES(
          context, reshaped_filter.CopyFrom(*filter_backprop, shape),
          errors::Internal(
              "Failed to reshape filter tensor for grouped convolution."));
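      // CopyFrom shares the underlying buffer and only changes the shape, so
      // the launcher below writes the gradient directly into filter_backprop.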

      // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
      // conv is supported.
      launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
                input,
                /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
                padding_, explicit_paddings_, &reshaped_filter, data_format_);
      return;
    }

    // For GPU inputs with type half, we cast inputs to float and outputs back
    // to half, as the half implementation is slow and does not use
    // full-precision accumulation in some cases.
    constexpr bool cast_to_float = std::is_same<T, Eigen::half>::value &&
                                   std::is_same<Device, GPUDevice>::value;
    using U = typename std::conditional<cast_to_float, float, T>::type;
    Tensor casted_out_backprop = out_backprop;
    Tensor casted_input = input;
    Tensor casted_filter_backprop = *filter_backprop;
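    // These Tensor copies are shallow; when no cast is needed they share
    // buffers with the originals, so the kernel below reads and writes the
    // original tensors directly.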
    const Device& device = context->template eigen_device<Device>();
    if (cast_to_float) {
      functor::CastFunctor<Device, float, Eigen::half> cast;
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DT_FLOAT, out_backprop.shape(),
                                            &casted_out_backprop));
      cast(device, casted_out_backprop.template flat<float>(),
           out_backprop.template flat<Eigen::half>());
      OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, input.shape(),
                                                     &casted_input));
      cast(device, casted_input.template flat<float>(),
           input.template flat<Eigen::half>());
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DT_FLOAT, filter_backprop->shape(),
                                            &casted_filter_backprop));
    }

    auto out_backprop_ptr = casted_out_backprop.template flat<U>().data();
    auto input_ptr = casted_input.template flat<U>().data();
    auto filter_backprop_ptr = casted_filter_backprop.template flat<U>().data();
    LaunchDepthwiseConvBackpropFilterOp<Device, U>()(
        context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr,
        data_format_);

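    // Cast the float accumulation back into the half-precision output.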
    if (cast_to_float) {
      functor::CastFunctor<Device, Eigen::half, float> cast;
      const Tensor& casted_filter_backprop_const = casted_filter_backprop;
      cast(device, filter_backprop->template flat<Eigen::half>(),
           casted_filter_backprop_const.template flat<float>());
    }
  }

 protected:
  bool use_cudnn_grouped_conv_;

 private:
  std::vector<int32> strides_;
  Padding padding_;
  std::vector<int64> explicit_paddings_;
  TensorFormat data_format_;
  int64 stride_;

  // For in_depth == 1 and grouped convolutions.
  LaunchConv2DBackpropFilterOp<Device, T> launcher_;
  bool cudnn_use_autotune_;
  DataType dtype_;

  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
};

#define REGISTER_CPU_KERNEL(T)                    \
  REGISTER_KERNEL_BUILDER(                        \
      Name("DepthwiseConv2dNativeBackpropFilter") \
          .Device(DEVICE_CPU)                     \
          .TypeConstraint<T>("T"),                \
      DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
#endif
#undef REGISTER_CPU_KERNEL

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER_GPU_KERNEL(T)                                        \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
                              .Device(DEVICE_GPU)                     \
                              .TypeConstraint<T>("T")                 \
                              .HostMemory("filter_sizes"),            \
                          DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>)

TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL

#if CUDNN_VERSION >= 7000
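// Registered below under the "cudnn_grouped_convolution" label; this variant
// opts in to the cuDNN grouped-convolution path for all supported dtypes,
// whereas the base class enables it only for half precision.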
template <typename T>
class DepthwiseConv2dGroupedConvBackpropFilterOp
    : public DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T> {
 public:
  DepthwiseConv2dGroupedConvBackpropFilterOp(OpKernelConstruction* context)
      : DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>(context) {
    this->use_cudnn_grouped_conv_ = true;
  }
};

#define REGISTER_GROUPED_CONV_KERNEL(T)                               \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
                              .Device(DEVICE_GPU)                     \
                              .TypeConstraint<T>("T")                 \
                              .HostMemory("filter_sizes")             \
                              .Label("cudnn_grouped_convolution"),    \
                          DepthwiseConv2dGroupedConvBackpropFilterOp<T>)

TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
#undef REGISTER_GROUPED_CONV_KERNEL
#endif  // CUDNN_VERSION
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow