/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#include <algorithm>
#include <cmath>

#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/conv_grad_ops.h"
#include "tensorflow/core/kernels/depthwise_conv_op.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA
#include "cuda/include/cudnn.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

// Gradient operations for depthwise convolution.

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
// Common code between the two backward pass kernels: verifies that the
// dimensions all match and extracts the padded rows and columns.
#define EXTRACT_AND_VERIFY_DIMENSIONS(label)                                   \
  const Tensor& out_backprop = context->input(2);                              \
  OP_REQUIRES(                                                                 \
      context, input_shape.dims() == 4,                                        \
      errors::InvalidArgument(label, ": input must be 4-dimensional"));        \
  OP_REQUIRES(                                                                 \
      context, filter_shape.dims() == 4,                                       \
      errors::InvalidArgument(label, ": filter must be 4-dimensional"));       \
  OP_REQUIRES(                                                                 \
      context, out_backprop.dims() == 4,                                       \
      errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
  const int64 batch = input_shape.dim_size(0);                                 \
  OP_REQUIRES(                                                                 \
      context, batch == out_backprop.dim_size(0),                              \
      errors::InvalidArgument(                                                 \
          label, ": input and out_backprop must have the same batch size"));   \
  const int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H');   \
  OP_REQUIRES(                                                                 \
      context,                                                                 \
      FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()),      \
      errors::InvalidArgument("Input rows too large"));                        \
  const int32 input_rows = static_cast<int32>(input_rows_raw);                 \
  const int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W');   \
  OP_REQUIRES(                                                                 \
      context,                                                                 \
      FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()),      \
      errors::InvalidArgument("Input cols too large"));                        \
  const int32 input_cols = static_cast<int32>(input_cols_raw);                 \
  const int64 filter_rows = filter_shape.dim_size(0);                          \
  const int64 filter_cols = filter_shape.dim_size(1);                          \
  const int64 output_rows_raw =                                                \
      GetTensorDim(out_backprop.shape(), data_format_, 'H');                   \
  OP_REQUIRES(                                                                 \
      context,                                                                 \
      FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()),     \
      errors::InvalidArgument("Output rows too large"));                       \
  const int32 output_rows = static_cast<int32>(output_rows_raw);               \
  const int64 output_cols_raw =                                                \
      GetTensorDim(out_backprop.shape(), data_format_, 'W');                   \
  OP_REQUIRES(                                                                 \
      context,                                                                 \
      FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()),     \
      errors::InvalidArgument("Output cols too large"));                       \
  const int32 output_cols = static_cast<int32>(output_cols_raw);               \
  const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C');         \
  OP_REQUIRES(context, in_depth == filter_shape.dim_size(2),                   \
              errors::InvalidArgument(                                         \
                  label, ": input and filter must have the same in_depth"));   \
  const int64 depth_multiplier = filter_shape.dim_size(3);                     \
  const int64 out_depth_raw =                                                  \
      GetTensorDim(out_backprop.shape(), data_format_, 'C');                   \
  OP_REQUIRES(                                                                 \
      context,                                                                 \
      FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()),       \
      errors::InvalidArgument("Output depth too large"));                      \
  const int32 out_depth = static_cast<int32>(out_depth_raw);                   \
  OP_REQUIRES(                                                                 \
      context, (depth_multiplier * in_depth) == out_depth,                     \
      errors::InvalidArgument(                                                 \
          label, ": depth_multiplier * in_depth not equal to out_depth"));     \
  const auto stride = stride_;                                                 \
  int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;                \
  OP_REQUIRES_OK(context,                                                      \
                 GetWindowedOutputSize(input_rows, filter_rows, stride,        \
                                       padding_, &out_rows, &pad_rows));       \
  OP_REQUIRES_OK(context,                                                      \
                 GetWindowedOutputSize(input_cols, filter_cols, stride,        \
                                       padding_, &out_cols, &pad_cols));       \
  OP_REQUIRES(                                                                 \
      context, output_rows == out_rows,                                        \
      errors::InvalidArgument(                                                 \
          label, ": Number of rows of out_backprop doesn't match computed: ",  \
          "actual = ", output_rows, ", computed = ", out_rows));               \
  OP_REQUIRES(                                                                 \
      context, output_cols == out_cols,                                        \
      errors::InvalidArgument(                                                 \
          label, ": Number of cols of out_backprop doesn't match computed: ",  \
          "actual = ", output_cols, ", computed = ", out_cols));               \
  DepthwiseArgs args;                                                          \
  args.batch = batch;                                                          \
  args.in_rows = input_rows;                                                   \
  args.in_cols = input_cols;                                                   \
  args.in_depth = in_depth;                                                    \
  args.filter_rows = filter_rows;                                              \
  args.filter_cols = filter_cols;                                              \
  args.depth_multiplier = depth_multiplier;                                    \
  args.stride = stride;                                                        \
  args.pad_rows = pad_rows;                                                    \
  args.pad_cols = pad_cols;                                                    \
  args.out_rows = out_rows;                                                    \
  args.out_cols = out_cols;                                                    \
  args.out_depth = out_depth;                                                  \
  VLOG(2) << "DepthwiseConv2d: " << label << " Input: [" << batch << ", "      \
          << input_rows << ", " << input_cols << ", " << in_depth              \
          << "]; Filter: [" << filter_rows << ", " << filter_cols << ", "      \
          << in_depth << ", " << depth_multiplier << "]; stride = " << stride  \
          << ", pad_rows = " << pad_rows << ", pad_cols = " << pad_cols        \
          << ", output: [" << batch << ", " << out_rows << ", " << out_cols    \
          << ", " << out_depth << "]";

// Copies data from local region in 'out_backprop' into 'buffer'.
// The local region coordinates are calculated as the set of output points
// which used the input point ('in_r', 'in_c') as input during the forward
// pass. Rather than spatially reversing the filter, the input is reversed
// during the copy. The copied data is padded to vector register-width
// boundaries so that it is aligned for efficient traversal and vector
// multiply-add by the depthwise input kernel.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//
//   'out_backprop': [batch, out_rows, out_cols, out_depth]
//
//     [a00, a01, a10, a11] [a20, a21, b00, b01]
//     [b10, b11, b20, b21] [...]
//     [e00, e01, e10, e11] [e20, e21, f00, f01]
//     [f10, f11, f20, f21] [...]
//
//   'buffer' (register boundaries shown):
//
//     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
//     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
//     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
//     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
//
template <typename T>
static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
                                     const int64 padded_filter_inner_dim_size,
                                     const int64 in_r, const int64 in_c,
                                     const T* out_backprop, T* buffer) {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

  const int64 stride = args.stride;
  const int64 filter_rows = args.filter_rows;
  const int64 filter_cols = args.filter_cols;
  const int64 pad_rows = args.pad_rows;
  const int64 pad_cols = args.pad_cols;
  const int64 out_rows = args.out_rows;
  const int64 out_cols = args.out_cols;

  // Calculate the output spatial region which used point (in_r, in_c) as
  // input.
  const int64 out_r_start = std::max(
      static_cast<int64>(0), (in_r - filter_rows + pad_rows + stride) / stride);
  const int64 out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
  const int64 out_c_start = std::max(
      static_cast<int64>(0), (in_c - filter_cols + pad_cols + stride) / stride);
  const int64 out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);

  // Zero-pad 'buffer' if output region is smaller than filter spatial size.
  const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
  if ((out_r_end - out_r_start + 1) < args.filter_rows ||
      (out_c_end - out_c_start + 1) < args.filter_cols) {
    memset(buffer, 0,
           filter_spatial_size * padded_filter_inner_dim_size * sizeof(T));
  }

  // Calculate vectorized and scalar (residual) lengths for 'in_depth'.
  const int64 vectorized_size = (args.out_depth / kPacketSize) * kPacketSize;
  const int64 scalar_size = args.out_depth % kPacketSize;
  const int64 pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;

  for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
    const int64 f_r = in_r + pad_rows - out_r * stride;
    for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
      const int64 f_c = in_c + pad_cols - out_c * stride;
      const int64 buf_base =
          (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
      // Calculate index into 'out_backprop' for coordinate (out_r, out_c).
      auto* out_bprop =
          out_backprop + (out_r * args.out_cols + out_c) * args.out_depth;

      // Copy vectorized portion of inner dimension into 'buffer'.
      for (int64 d = 0; d < vectorized_size; d += kPacketSize) {
        auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
        Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
      }
      // Copy scalar portion of out_bprop to 'buffer'.
      for (int64 d = 0; d < scalar_size; ++d) {
        buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
      }
      // Pad to vector-register width (if needed).
      for (int64 d = 0; d < pad_size; ++d) {
        buffer[buf_base + vectorized_size + scalar_size + d] =
            static_cast<T>(0);
      }
    }
  }
}
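
// Worked example (illustrative only): with filter_rows = filter_cols = 2,
// stride = 1, pad_rows = pad_cols = 0 and out_rows = out_cols >= 2 (as in the
// EX above), input point (in_r, in_c) = (1, 1) maps to
//   out_r_start = max(0, (1 - 2 + 0 + 1) / 1) = 0,  out_r_end = 1,
//   out_c_start = 0,                                out_c_end = 1,
// i.e. the four output points that read input (1, 1) during the forward pass.
// The filter coordinate for each is recovered as f_r = in_r + pad_rows -
// out_r * stride (and likewise for f_c), which is what reverses the filter.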

// Computes the vectorized product of 'buffer' and 'filter' and stores
// result in 'output' at location computed from 'in_r' and 'in_c'.
// If depth_multiplier is > 1, the intermediate output is reduced along
// the depth_multiplier dimension.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
//
//   'buffer' [rows, cols, in_depth, depth_multiplier]
//
//     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
//     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
//     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
//     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
//
//   filter [rows, cols, in_depth, depth_multiplier]
//     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
//     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
//
//   First output register [in_depth, depth_multiplier]
//     [q00, q01, q10, q11] = ([f00, f01, f10, f11] x [u0, v0, w0, x0]) +
//                            ([e00, e01, e10, e11] x [u1, v1, w1, x1]) +
//                            ([b00, b01, b10, b11] x [u2, v2, w2, x2]) +
//                            ([a00, a01, a10, a11] x [u3, v3, w3, x3])
//
//   Reduction step along depth-multiplier dimension:
//
//     [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0]
//

template <typename T>
static void ComputeBackpropInput(const DepthwiseArgs& args,
                                 const int64 padded_filter_inner_dim_size,
                                 const int64 in_r, const int64 in_c,
                                 const T* filter, const T* buffer,
                                 T* out_buffer, T* output) {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

  const int64 in_depth = args.in_depth;
  const int64 depth_multiplier = args.depth_multiplier;
  const int64 out_depth = args.out_depth;
  const int64 filter_spatial_size = args.filter_rows * args.filter_cols;

  // Calculate vectorized and scalar lengths of 'out_depth'.
  const int64 output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
  const int64 output_scalar_size = out_depth % kPacketSize;

  // Calculate base index at which to begin writing output.
  const int64 base_output_index = (in_r * args.in_cols + in_c) * in_depth;

  // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is
  // used to efficiently reduce output when 'depth_multiplier' > kPacketSize.
  const int64 dm_vectorized_size =
      (depth_multiplier / kPacketSize) * kPacketSize;
  const int64 dm_scalar_size = depth_multiplier % kPacketSize;

  for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
    // Reset accumulator.
    auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
    for (int j = 0; j < filter_spatial_size; ++j) {
      // Calculate index.
      const int64 index = i + j * padded_filter_inner_dim_size;
      // Load filter.
      const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
      // Load input.
      const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
      // Vector multiply-add.
      vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
    }
    if (depth_multiplier == 1) {
      // Write directly to the output.
      Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
    } else {
      // Buffer output for subsequent reduction step.
      Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
    }
  }

  if (output_scalar_size > 0) {
    auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
    for (int j = 0; j < filter_spatial_size; ++j) {
      const int64 index =
          output_vectorized_size + j * padded_filter_inner_dim_size;
      const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
      const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
      vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
    }
    // Load accumulator into an array and loop through output.
    T out_buf[kPacketSize];
    Eigen::internal::pstoreu<T>(out_buf, vaccum);
    if (depth_multiplier == 1) {
      // Write directly to the output.
      for (int j = 0; j < output_scalar_size; ++j) {
        output[base_output_index + output_vectorized_size + j] = out_buf[j];
      }
    } else {
      // Buffer output for subsequent reduction step.
      for (int j = 0; j < output_scalar_size; ++j) {
        out_buffer[output_vectorized_size + j] = out_buf[j];
      }
    }
  }

  // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'.
  if (depth_multiplier > 1) {
    for (int64 d = 0; d < in_depth; ++d) {
      const int64 index = d * args.depth_multiplier;
      T accum = static_cast<T>(0);
      for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
        const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
        accum += Eigen::internal::predux(v);
      }
      // Copy scalar portion of replicated output.
      for (int64 dm = 0; dm < dm_scalar_size; ++dm) {
        accum += out_buffer[index + dm_vectorized_size + dm];
      }
      // Copy to output.
      output[base_output_index + d] = accum;
    }
  }
}
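
// Worked example (illustrative only): for T = float with a 4-wide packet
// (kPacketSize = 4), in_depth = 3 and depth_multiplier = 2 give out_depth = 6,
// so output_vectorized_size = 4 and output_scalar_size = 2. Because
// depth_multiplier (2) < kPacketSize, dm_vectorized_size = 0 and the reduction
// loop simply sums out_buffer[2 * d] and out_buffer[2 * d + 1] into
// output[base_output_index + d] for d = 0, 1, 2, matching the
// [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0] step shown above.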

// Computes the depthwise conv2d backprop input of 'out_backprop' by
// 'depthwise_filter' and stores the result in 'in_backprop'.
template <typename T>
struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;

  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* depthwise_filter,
                  T* in_backprop, TensorFormat data_format) {
    OP_REQUIRES(
        ctx, data_format == FORMAT_NHWC,
        errors::Unimplemented(
            "Depthwise convolution on CPU is only supported for NHWC format"));

    static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

    // Pad 'depthwise_filter' to vector register width (if needed).
    const bool pad_filter = (args.out_depth % kPacketSize) == 0 ? false : true;
    Tensor padded_filter;
    if (pad_filter) {
      // Allocate space for padded filter.
      const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64 padded_filter_inner_dim_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({filter_spatial_size,
                                               padded_filter_inner_dim_size}),
                                  &padded_filter));
      // Write out padded filter.
      functor::DepthwiseFilterPadOp<T>()(
          args, depthwise_filter, padded_filter.template flat<T>().data());
    }
    const T* filter_data =
        pad_filter ? padded_filter.template flat<T>().data() : depthwise_filter;

    // Computes one shard of depthwise conv2d backprop input.
    auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
                     int64 start, int64 limit) {
      static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

      const int64 input_image_size =
          args.in_rows * args.in_cols * args.in_depth;
      const int64 output_image_size =
          args.out_rows * args.out_cols * args.out_depth;
      const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64 padded_filter_inner_dim_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

      // Allocate buffer to copy regions from 'out_backprop'.
      Tensor out_bprop_buffer;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({filter_spatial_size,
                                               padded_filter_inner_dim_size}),
                                  &out_bprop_buffer));
      T* out_bprop_buf = out_bprop_buffer.template flat<T>().data();

      // Allocate buffer for intermediate results.
      Tensor in_bprop_buffer;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({padded_filter_inner_dim_size}),
                                  &in_bprop_buffer));
      T* in_bprop_buf = in_bprop_buffer.template flat<T>().data();

      for (int64 b = start; b < limit; ++b) {
        for (int64 in_r = 0; in_r < args.in_rows; ++in_r) {
          for (int64 in_c = 0; in_c < args.in_cols; ++in_c) {
            // Populate 'out_bprop_buf' from local 'out_backprop' region.
            CopyOutputBackpropRegion<T>(
                args, padded_filter_inner_dim_size, in_r, in_c,
                out_backprop + b * output_image_size, out_bprop_buf);

            // Compute depthwise backprop input.
            ComputeBackpropInput<T>(args, padded_filter_inner_dim_size, in_r,
                                    in_c, filter_data, out_bprop_buf,
                                    in_bprop_buf,
                                    in_backprop + b * input_image_size);
          }
        }
      }
    };

    const int64 shard_cost = args.in_rows * args.in_cols * args.out_depth;
    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
    Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
          shard_cost, shard);
  }
};
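
// Usage sketch (illustrative only): the op defined below invokes this functor
// roughly as
//
//   LaunchDepthwiseConvBackpropInputOp<CPUDevice, T>()(
//       context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
//       FORMAT_NHWC);
//
// Shard() partitions the batch range [0, args.batch) across the threadpool and
// calls the lambda above with disjoint [start, limit) ranges, e.g. a batch of 8
// on four threads might run as shard(0, 2), shard(2, 4), shard(4, 6) and
// shard(6, 8), with shard_cost guiding how finely the range is split.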

template <typename T>
static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
                                                const T* out_backprop,
                                                const T* filter,
                                                T* in_backprop) {
  // Naive for loop as a reference point without concerns about performance.
  for (int b = 0; b < args.batch; ++b) {
    for (int in_r = 0; in_r < args.in_rows; ++in_r) {
      for (int in_c = 0; in_c < args.in_cols; ++in_c) {
        for (int in_d = 0; in_d < args.in_depth; ++in_d) {
          T sum = 0;
          const int stride = args.stride;
          const int out_d_start = in_d * args.depth_multiplier;
          const int out_d_end = out_d_start + args.depth_multiplier;

          for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
            const int out_r_start = std::max(
                0, (in_r - args.filter_rows + args.pad_rows + stride) / stride);
            const int out_r_end =
                std::min(args.out_rows - 1, (in_r + args.pad_rows) / stride);

            for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
              const int out_c_start = std::max(
                  0,
                  (in_c - args.filter_cols + args.pad_cols + stride) / stride);
              const int out_c_end =
                  std::min(args.out_cols - 1, (in_c + args.pad_cols) / stride);

              for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
                int f_r = in_r + args.pad_rows - out_r * stride;
                int f_c = in_c + args.pad_cols - out_c * stride;
                int filter_dm = out_d - out_d_start;
                int out_backprop_offset =
                    out_d +
                    args.out_depth *
                        (out_c + args.out_cols * (out_r + args.out_rows * b));
                int filter_offset =
                    filter_dm +
                    args.depth_multiplier *
                        (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
                sum +=
                    out_backprop[out_backprop_offset] * filter[filter_offset];
              }
            }
          }

          int in_backprop_offset =
              in_d +
              args.in_depth * (in_c + args.in_cols * (in_r + args.in_rows * b));
          in_backprop[in_backprop_offset] = sum;
        }
      }
    }
  }
}

// Extern template instantiated in conv_grad_input_ops.cc.
extern template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>;
extern template struct LaunchConv2DBackpropInputOp<CPUDevice, float>;
extern template struct LaunchConv2DBackpropInputOp<CPUDevice, double>;

#if GOOGLE_CUDA

// Extern template instantiated in conv_grad_input_ops.cc.
extern template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
extern template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
extern template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;

// Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
                                                          Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA

// Kernel to compute the input backprop for depthwise convolution.
template <typename Device, class T>
class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
 public:
  explicit DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));

    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));

    stride_ = GetTensorDim(strides_, data_format_, 'H');
    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
    const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
    const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');

    OP_REQUIRES(context, stride_ == stride_w,
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (stride_n == 1 && stride_c == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));

    // For in_depth == 1 and grouped convolutions.
    use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
    cudnn_use_autotune_ = CudnnUseAutotune();
    use_cudnn_grouped_conv_ = false;
    dtype_ = DataTypeToEnum<T>::value;
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input_sizes = context->input(0);
    const Tensor& filter = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(input_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
            input_sizes.dims()));
    TensorShape input_shape;
    const int32* in_sizes_data = input_sizes.template flat<int32>().data();

    for (int i = 0; i < input_sizes.NumElements(); ++i) {
      OP_REQUIRES(context, in_sizes_data[i] >= 0,
                  errors::InvalidArgument("Dimension ", i,
                                          " of input_sizes must be >= 0"));
      input_shape.AddDim(in_sizes_data[i]);
    }
    const TensorShape& filter_shape = filter.shape();
    EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");

    Tensor* in_backprop = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {0}, 0, input_shape, &in_backprop));

    // If there is nothing to compute, return.
    if (input_shape.num_elements() == 0) {
      return;
    }

    // If in_depth==1, this operation is just a standard convolution.
    // Depthwise convolution is a special case of cuDNN's grouped convolution.
    bool use_cudnn = use_cudnn_ && (in_depth == 1 || use_cudnn_grouped_conv_);

    VLOG(2) << "DepthwiseConv2dNativeBackpropInput: "
            << " Input: [" << batch << ", " << input_rows << ", " << input_cols
            << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
            << filter_cols << ", " << in_depth << ", " << depth_multiplier
            << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
            << ", " << out_depth << "], stride = " << stride_
            << ", pad_rows = " << pad_rows << ", pad_cols = " << pad_cols
            << ", Use cuDNN: " << use_cudnn;

    if (use_cudnn) {
      // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
      //
      //                  | TensorFlow       | cuDNN
      // --------------------------------------------------------------------
      // filter_out_depth | depth_multiplier | depth_multiplier * group_count
      // filter_in_depth  | in_depth         | in_depth / group_count
      //
      // For depthwise convolution, we have group_count == in_depth.
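      //
      // Example (illustrative only, numbers assumed): with in_depth = 8 and
      // depth_multiplier = 2, group_count = 8, so the TensorFlow filter of
      // shape [rows, cols, 8, 2] is handed to cuDNN as a grouped-convolution
      // filter of shape [rows, cols, filter_in_depth = 1, out_depth = 16].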
      int32 filter_in_depth = 1;
      TensorShape shape =
          TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
      Tensor reshaped_filter(/*type=*/dtype_);
      OP_REQUIRES(
          context, reshaped_filter.CopyFrom(filter, shape),
          errors::Internal(
              "Failed to reshape filter tensor for grouped convolution."));
      // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
      // conv is supported.
      launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop,
                reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
                stride_, stride_, padding_, /*explicit_paddings=*/{},
                in_backprop, data_format_);
      return;
    }

    auto out_backprop_ptr = out_backprop.template flat<T>().data();
    auto filter_ptr = filter.template flat<T>().data();
    auto in_backprop_ptr = in_backprop->template flat<T>().data();
    LaunchDepthwiseConvBackpropInputOp<Device, T>()(
        context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
        data_format_);
  }

 protected:
  bool use_cudnn_grouped_conv_;

 private:
  std::vector<int32> strides_;
  Padding padding_;
  TensorFormat data_format_;
  int64 stride_;

  // For in_depth == 1 and grouped convolutions.
  LaunchConv2DBackpropInputOp<Device, T> launcher_;
  bool use_cudnn_;
  bool cudnn_use_autotune_;
  DataType dtype_;

  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
};

#define REGISTER_CPU_KERNEL(T)                                                 \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")           \
                              .Device(DEVICE_CPU)                              \
                              .TypeConstraint<T>("T"),                         \
                          DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);

TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
#endif
#undef REGISTER_CPU_KERNEL

#if GOOGLE_CUDA

#define REGISTER_GPU_KERNEL(T)                                                 \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")           \
                              .Device(DEVICE_GPU)                              \
                              .TypeConstraint<T>("T")                          \
                              .HostMemory("input_sizes"),                      \
                          DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>)

TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL

#if CUDNN_VERSION >= 7000
template <typename T>
class DepthwiseConv2dGroupedConvBackpropInputOp
    : public DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T> {
 public:
  DepthwiseConv2dGroupedConvBackpropInputOp(OpKernelConstruction* context)
      : DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>(context) {
    this->use_cudnn_grouped_conv_ = true;
  }
};

#define REGISTER_GROUPED_CONV_KERNEL(T)                                        \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")           \
                              .Device(DEVICE_GPU)                              \
                              .TypeConstraint<T>("T")                          \
                              .HostMemory("input_sizes")                       \
                              .Label("cudnn_grouped_convolution"),             \
                          DepthwiseConv2dGroupedConvBackpropInputOp<T>)

TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
#undef REGISTER_GROUPED_CONV_KERNEL
#endif  // CUDNN_VERSION
#endif  // GOOGLE_CUDA

// Kernels to compute the gradients of the filters for depthwise convolution.

// Computes filter backprop using 'out_backprop' and 'input_buffer', storing the
// result in 'output_buffer' at an index computed from 'out_r' and 'out_c'.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
//
//   'input_buffer' [rows, cols, in_depth, depth_multiplier]
//
//     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
//     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
//     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
//     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
//
//   'out_backprop' [out_rows, out_cols, in_depth, depth_multiplier]
//
//     [q00, q01, q10, q11] [q20, q21, r00, r01]
//     [r10, r11, r20, r21] [s00, s01, s10, s11]
//     [s20, s21, t00, t01] [t10, t11, t20, t21]
//
//   First output register of 'filter_backprop'
//     [u0, v0, w0, x0] += ([f00, f01, f10, f11] x [q00, q01, q10, q11])
//
template <typename T>
static void ComputeBackpropFilter(const DepthwiseArgs& args,
                                  const int64 padded_out_depth_size,
                                  const int64 out_r, const int64 out_c,
                                  const T* out_backprop, const T* input_buffer,
                                  T* output_buffer) {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
  // Calculate vectorized size of 'padded_out_depth_size'.
  const int64 out_depth = args.out_depth;
  const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
  const int64 output_vectorized_size =
      (padded_out_depth_size / kPacketSize) * kPacketSize;
  const int64 base_output_index = (out_r * args.out_cols + out_c) * out_depth;
  // Determine whether we can execute fast or slow code path.
  const int64 output_image_size =
      args.out_rows * args.out_cols * args.out_depth;
  const int64 output_last_vector_index =
      output_image_size - (filter_spatial_size * padded_out_depth_size);
  const bool fast_path = base_output_index <= output_last_vector_index;

  if (fast_path) {
    // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can
    // amortize the cost of 'output_buffer' load store in the loop below.
    for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
      // Load vector register from 'out_backprop'.
      const auto out_bprop_block =
          Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
      for (int j = 0; j < filter_spatial_size; ++j) {
        const int64 index = i + j * padded_out_depth_size;
        // Load vector register from 'input_buffer'.
        const auto input_block =
            Eigen::internal::ploadu<Packet>(input_buffer + index);
        // Load output block into vector register.
        auto out_block_data = output_buffer + index;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        // Vector multiply-add.
        out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
                                                   out_block);
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
    }
  } else {
    // Slow path (can't do vector reads from non-padded 'out_backprop').
    for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
      // Calculate safe read size from 'out_backprop'.
      const int64 out_bprop_index = base_output_index + i;
      const int64 out_bprop_limit =
          std::min(output_image_size, out_bprop_index + kPacketSize);
      T out_buf[kPacketSize];
      memset(&out_buf, 0, kPacketSize * sizeof(T));
      const int64 scalar_size = out_bprop_limit - out_bprop_index;
      for (int64 j = 0; j < scalar_size; ++j) {
        out_buf[j] = out_backprop[out_bprop_index + j];
      }
      // Load vector register from 'out_buf'.
      const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
      for (int j = 0; j < filter_spatial_size; ++j) {
        const int64 index = i + j * padded_out_depth_size;
        // Load vector register from 'input_buffer'.
        const auto input_block =
            Eigen::internal::ploadu<Packet>(input_buffer + index);
        // Load output block into vector register.
        auto out_block_data = output_buffer + index;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        // Vector multiply-add.
        out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
                                                   out_block);
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
    }
  }
}
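
// Worked example (illustrative only): with out_rows = out_cols = 3,
// out_depth = 6 padded to padded_out_depth_size = 8 (kPacketSize = 4) and a
// 2x2 filter (filter_spatial_size = 4), output_image_size = 54 and
// output_last_vector_index = 54 - 4 * 8 = 22, so output points with
// base_output_index <= 22 take the vectorized fast path, while later points
// use the bounds-checked copy so that vector loads never read past the end of
// 'out_backprop'.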

template <typename Device, typename T>
struct LaunchDepthwiseConvBackpropFilterOp;

template <typename T>
struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;

  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* input, T* filter_backprop,
                  TensorFormat data_format) {
    OP_REQUIRES(
        ctx, data_format == FORMAT_NHWC,
        errors::Unimplemented(
            "Depthwise convolution on CPU is only supported for NHWC format"));

    static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

    const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    const int64 padded_out_depth_size =
        ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

    // Allocate output buffers for each image in 'batch' (padded to vector
    // register boundaries).
    Tensor output_buffer;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                TensorShape({args.batch, filter_spatial_size,
                                             padded_out_depth_size}),
                                &output_buffer));
    T* output_buffer_data = output_buffer.template flat<T>().data();

    // Computes one shard of depthwise conv2d backprop filter.
    auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
                     int64 start, int64 limit) {
      static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
      const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64 padded_out_depth_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

      // Allocate buffer for local input regions.
      Tensor input_buffer;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(
                   DataTypeToEnum<T>::value,
                   TensorShape({filter_spatial_size, padded_out_depth_size}),
                   &input_buffer));
      T* input_buffer_data = input_buffer.template flat<T>().data();

      const int64 input_image_size =
          args.in_rows * args.in_cols * args.in_depth;
      const int64 output_image_size =
          args.out_rows * args.out_cols * args.out_depth;
      const int64 padded_filter_size =
          filter_spatial_size * padded_out_depth_size;

      for (int b = start; b < limit; ++b) {
        // Initialize 'output_buffer' for 'b'.
        auto* output_buffer = output_buffer_data + b * padded_filter_size;
        memset(output_buffer, 0, padded_filter_size * sizeof(T));

        for (int out_r = 0; out_r < args.out_rows; ++out_r) {
          for (int out_c = 0; out_c < args.out_cols; ++out_c) {
            // Populate 'input_buffer_data' with data from local input region.
            functor::DepthwiseInputCopyOp<T>()(
                args, padded_out_depth_size, out_r, out_c,
                input + b * input_image_size, input_buffer_data);
            // Compute depthwise backprop filter.
            ComputeBackpropFilter(args, padded_out_depth_size, out_r, out_c,
                                  out_backprop + b * output_image_size,
                                  input_buffer_data, output_buffer);
          }
        }
      }
    };
    const int64 shard_cost = args.out_rows * args.out_cols * args.out_depth;
    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
    Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
          shard_cost, shard);

    // Accumulate 'output_buffer' from each shard into 'output'.
    const int64 out_depth = args.out_depth;
    const int64 vectorized_size = (out_depth / kPacketSize) * kPacketSize;
    const int64 scalar_size = out_depth - vectorized_size;
    const int64 padded_filter_size =
        filter_spatial_size * padded_out_depth_size;
    memset(filter_backprop, 0, filter_spatial_size * out_depth * sizeof(T));

    for (int64 i = 0; i < filter_spatial_size; ++i) {
      const int64 buffer_base = i * padded_out_depth_size;
      const int64 output_base = i * out_depth;
      // Write vectorized length of filter's inner dimension to output.
      for (int64 j = 0; j < vectorized_size; j += kPacketSize) {
        // Load data from 'filter_backprop' into vector register.
        auto out_block_data = filter_backprop + output_base + j;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        for (int b = 0; b < args.batch; ++b) {
          // Load data from 'output_buffer' for 'b'.
          const auto* output_buffer =
              output_buffer_data + b * padded_filter_size;
          const auto v =
              Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j);
          // Add 'v' to 'out_block'.
          out_block = Eigen::internal::padd<Packet>(out_block, v);
        }
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
      // Write scalar length of filter's inner dimension to output.
      for (int64 j = 0; j < scalar_size; ++j) {
        for (int b = 0; b < args.batch; ++b) {
          const auto* output_buffer =
              output_buffer_data + b * padded_filter_size;
          filter_backprop[output_base + vectorized_size + j] +=
              output_buffer[buffer_base + vectorized_size + j];
        }
      }
    }
  }
};

template <typename T>
static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
                                                 const T* out_backprop,
                                                 const T* input,
                                                 T* filter_backprop) {
  int num_filter_backprop = args.filter_rows * args.filter_cols *
                            args.in_depth * args.depth_multiplier;
  memset(filter_backprop, 0, num_filter_backprop * sizeof(T));
  // Naive for loop as a reference point without concerns about performance.
  for (int b = 0; b < args.batch; ++b) {
    for (int out_r = 0; out_r < args.out_rows; ++out_r) {
      for (int out_c = 0; out_c < args.out_cols; ++out_c) {
        for (int out_d = 0; out_d < args.out_depth; ++out_d) {
          const int in_d = out_d / args.depth_multiplier;
          const int dm = out_d % args.depth_multiplier;
          const int in_r_start = out_r * args.stride - args.pad_rows;
          const int in_c_start = out_c * args.stride - args.pad_cols;

          for (int f_r = 0; f_r < args.filter_rows; ++f_r) {
            for (int f_c = 0; f_c < args.filter_cols; ++f_c) {
              const int in_r = in_r_start + f_r;
              const int in_c = in_c_start + f_c;

              if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
                  in_c < args.in_cols) {
                int out_backprop_offset =
                    out_d +
                    args.out_depth *
                        (out_c + args.out_cols * (out_r + args.out_rows * b));
                int input_offset =
                    in_d +
                    args.in_depth *
                        (in_c + args.in_cols * (in_r + args.in_rows * b));
                int filter_backprop_offset =
                    dm +
                    args.depth_multiplier *
                        (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
                filter_backprop[filter_backprop_offset] +=
                    input[input_offset] * out_backprop[out_backprop_offset];
              }
            }
          }
        }
      }
    }
  }
}
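
// Worked example (illustrative only): for filter tap
// (f_r, f_c, in_d, dm) = (0, 0, 0, 0) with stride = 1 and no padding, the
// reference loop accumulates input(b, out_r, out_c, 0) *
// out_backprop(b, out_r, out_c, 0) over every batch b and output position
// (out_r, out_c), which is the usual correlation form of the depthwise filter
// gradient and serves as a reference for the vectorized path above.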

// Extern template instantiated in conv_grad_filter_ops.cc.
extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, Eigen::half>;
extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, float>;
extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, double>;

#if GOOGLE_CUDA

// Extern template instantiated in conv_grad_filter_ops.cc.
extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, Eigen::half>;
extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, float>;
extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, double>;

// Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
                                                           Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA

// Kernel to compute the filter backprop for depthwise convolution.
template <typename Device, class T>
class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
 public:
  explicit DepthwiseConv2dNativeBackpropFilterOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));

    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));

    stride_ = GetTensorDim(strides_, data_format_, 'H');
    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
    const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
    const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');

    OP_REQUIRES(context, stride_ == stride_w,
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (stride_n == 1 && stride_c == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));

    // For in_depth == 1 and grouped convolutions.
    use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
    cudnn_use_autotune_ = CudnnUseAutotune();
    use_cudnn_grouped_conv_ = false;

    if (std::is_same<T, Eigen::half>::value) {
      dtype_ = DT_HALF;
    } else if (std::is_same<T, float>::value) {
      dtype_ = DT_FLOAT;
    } else if (std::is_same<T, double>::value) {
      dtype_ = DT_DOUBLE;
    } else {
      LOG(ERROR) << "Only half, float, and double are supported.";
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& filter_sizes = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(filter_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
            filter_sizes.dims()));
    TensorShape filter_shape;
    const int32* filter_sizes_data = filter_sizes.template flat<int32>().data();
    for (int i = 0; i < filter_sizes.NumElements(); ++i) {
      OP_REQUIRES(context, filter_sizes_data[i] >= 0,
                  errors::InvalidArgument("Dimension ", i,
                                          " of filter_sizes must be >= 0"));
      filter_shape.AddDim(filter_sizes_data[i]);
    }
    const TensorShape& input_shape = input.shape();

    EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropFilter");
    Tensor* filter_backprop = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {1}, 0, filter_shape, &filter_backprop));

    // If there is nothing to compute, return.
    if (out_backprop.shape().num_elements() == 0) {
      return;
    }

    // If in_depth==1, this operation is just a standard convolution.
    // Depthwise convolution is a special case of cuDNN's grouped convolution.
    bool use_cudnn = use_cudnn_ && (in_depth == 1 || use_cudnn_grouped_conv_);

    VLOG(2) << "DepthwiseConv2dNativeBackpropFilter: "
            << " Input: [" << batch << ", " << input_rows << ", " << input_cols
            << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
            << filter_cols << ", " << in_depth << ", " << depth_multiplier
            << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
            << ", " << out_depth << "], stride = " << stride_
            << ", pad_rows = " << pad_rows << ", pad_cols = " << pad_cols
            << ", Use cuDNN: " << use_cudnn;

    if (use_cudnn) {
      // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
      //
      //                  | TensorFlow       | cuDNN
      // --------------------------------------------------------------------
      // filter_out_depth | depth_multiplier | depth_multiplier * group_count
      // filter_in_depth  | in_depth         | in_depth / group_count
      //
      // For depthwise convolution, we have group_count == in_depth.
      int32 filter_in_depth = 1;
      TensorShape shape =
          TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
      Tensor reshaped_filter(/*type=*/dtype_);
      OP_REQUIRES(
          context, reshaped_filter.CopyFrom(*filter_backprop, shape),
          errors::Internal(
              "Failed to reshape filter tensor for grouped convolution."));

      // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
      // conv is supported.
      launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
                /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
                padding_, /*explicit_paddings=*/{}, &reshaped_filter,
                data_format_);
      return;
    }

    auto out_backprop_ptr = out_backprop.template flat<T>().data();
    auto input_ptr = input.template flat<T>().data();
    auto filter_backprop_ptr = filter_backprop->template flat<T>().data();
    LaunchDepthwiseConvBackpropFilterOp<Device, T>()(
        context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr,
        data_format_);
  }

 protected:
  bool use_cudnn_grouped_conv_;

 private:
  std::vector<int32> strides_;
  Padding padding_;
  TensorFormat data_format_;
  int64 stride_;

  // For in_depth == 1 and grouped convolutions.
  LaunchConv2DBackpropFilterOp<Device, T> launcher_;
  bool use_cudnn_;
  bool cudnn_use_autotune_;
  DataType dtype_;

  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
};

#define REGISTER_CPU_KERNEL(T)                                                 \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("DepthwiseConv2dNativeBackpropFilter")                              \
          .Device(DEVICE_CPU)                                                  \
          .TypeConstraint<T>("T"),                                             \
      DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
#endif
#undef REGISTER_CPU_KERNEL

#if GOOGLE_CUDA
#define REGISTER_GPU_KERNEL(T)                                                 \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter")          \
                              .Device(DEVICE_GPU)                              \
                              .TypeConstraint<T>("T")                          \
                              .HostMemory("filter_sizes"),                     \
                          DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>)

TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL

#if CUDNN_VERSION >= 7000
template <typename T>
class DepthwiseConv2dGroupedConvBackpropFilterOp
    : public DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T> {
 public:
  DepthwiseConv2dGroupedConvBackpropFilterOp(OpKernelConstruction* context)
      : DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>(context) {
    this->use_cudnn_grouped_conv_ = true;
  }
};

#define REGISTER_GROUPED_CONV_KERNEL(T)                                        \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter")          \
                              .Device(DEVICE_GPU)                              \
                              .TypeConstraint<T>("T")                          \
                              .HostMemory("filter_sizes")                      \
                              .Label("cudnn_grouped_convolution"),             \
                          DepthwiseConv2dGroupedConvBackpropFilterOp<T>)

TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
#undef REGISTER_GROUPED_CONV_KERNEL
#endif  // CUDNN_VERSION
#endif  // GOOGLE_CUDA

}  // namespace tensorflow