/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

namespace {

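// Backward-of-gradient kernel for 3D max pooling in NCDHW layout. Each GPU
// thread handles one pooled output element: it rescans its pooling window in
// bottom_data for the first position whose value matches the max recorded in
// output_data, then forwards the corresponding top_diff value to bottom_diff.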
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNCDHW(
    const int nthreads, const dtype* __restrict__ bottom_data,
    const dtype* __restrict__ output_data, const int pooled_plane,
    const int pooled_height, const int pooled_width, const int channels,
    const int plane, const int height, const int width, const int kernel_p,
    const int kernel_h, const int kernel_w, const int stride_p,
    const int stride_h, const int stride_w, const int pad_p, const int pad_t,
    const int pad_l, const dtype* __restrict__ top_diff,
    dtype* __restrict__ bottom_diff) {
  GPU_1D_KERNEL_LOOP(index, nthreads) {
    // First find the index of the maximum, since we have no mask.
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int pp = (index / pooled_width / pooled_height) % pooled_plane;
    int c = (index / pooled_width / pooled_height / pooled_plane) % channels;
    int n = (index / pooled_width / pooled_height / pooled_plane / channels);
    int pstart = pp * stride_p - pad_p;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    const int pend = min(pstart + kernel_p, plane);
    const int hend = min(hstart + kernel_h, height);
    const int wend = min(wstart + kernel_w, width);
    pstart = max(pstart, 0);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    bool should_stop = false;
    int maxidx = -1;
    const dtype* bottom_data_n =
        bottom_data + n * channels * plane * height * width;
    // Propagate only the first value from top_diff corresponding to the
    // maximum.
    for (int p = pstart; p < pend && !should_stop; ++p) {
      for (int h = hstart; h < hend && !should_stop; ++h) {
        for (int w = wstart; w < wend && !should_stop; ++w) {
          int idx = c * plane * height * width + (p * height + h) * width + w;
          if (output_data[index] == bottom_data_n[idx]) {
            maxidx = idx;
            should_stop = true;
          }
        }
      }
    }
    // Set the bottom diff (atomic is not necessary). The index may still be
    // -1 if all of the bottom_data values are NaN.
    if (maxidx != -1) {
      bottom_diff[index] =
          top_diff[n * channels * plane * height * width + maxidx];
    }
  }
}

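// Same computation as above, but for NDHWC layout: the flat output index is
// decoded from innermost (channels) to outermost (batch), and window offsets
// treat channels as the fastest-varying dimension.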
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNDHWC(
    const int nthreads, const dtype* __restrict__ bottom_data,
    const dtype* __restrict__ output_data, const int pooled_plane,
    const int pooled_height, const int pooled_width, const int channels,
    const int plane, const int height, const int width, const int kernel_p,
    const int kernel_h, const int kernel_w, const int stride_p,
    const int stride_h, const int stride_w, const int pad_p, const int pad_t,
    const int pad_l, const dtype* __restrict__ top_diff,
    dtype* __restrict__ bottom_diff) {
  GPU_1D_KERNEL_LOOP(index, nthreads) {
    // First find the index of the maximum, since we have no mask.
    int n = index;
    int c = n % channels;
    n /= channels;
    int wstart = (n % pooled_width) * stride_w - pad_l;
    int wend = min(wstart + kernel_w, width);
    wstart = max(wstart, 0);
    n /= pooled_width;
    int hstart = (n % pooled_height) * stride_h - pad_t;
    int hend = min(hstart + kernel_h, height);
    hstart = max(hstart, 0);
    n /= pooled_height;
    int pstart = (n % pooled_plane) * stride_p - pad_p;
    int pend = min(pstart + kernel_p, plane);
    pstart = max(pstart, 0);
    n /= pooled_plane;
    bool should_stop = false;
    int maxidx = -1;
    const dtype* bottom_data_n =
        bottom_data + n * plane * height * width * channels;
    // Propagate only the first value from top_diff corresponding to the
    // maximum.
    for (int p = pstart; p < pend && !should_stop; ++p) {
      for (int h = hstart; h < hend && !should_stop; ++h) {
        for (int w = wstart; w < wend && !should_stop; ++w) {
          int idx = ((p * height + h) * width + w) * channels + c;
          if (output_data[index] == bottom_data_n[idx]) {
            maxidx = idx;
            should_stop = true;
          }
        }
      }
    }
    // Set the bottom diff (atomic is not necessary). The index may still be
    // -1 if all of the bottom_data values are NaN.
    if (maxidx != -1) {
      bottom_diff[index] =
          top_diff[n * plane * height * width * channels + maxidx];
    }
  }
}

}  // namespace

namespace functor {

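// Dispatches to the layout-specific kernel above, launching one GPU thread
// per pooled output element with a 1-D launch configuration from
// GetGpuLaunchConfig. Note that FORMAT_NHWC here denotes the channels-last
// (NDHWC) layout of the 5-D tensors.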
template <typename T>
bool MaxPool3dGradBackward<T>::operator()(
    TensorFormat data_format, const T* bottom_data, const T* output_data,
    const int batch, const int pooled_plane, const int pooled_height,
    const int pooled_width, const int channels, const int plane,
    const int height, const int width, const int kernel_p, const int kernel_h,
    const int kernel_w, const int stride_p, const int stride_h,
    const int stride_w, const int pad_p, const int pad_t, const int pad_l,
    const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d) {
  int num_kernels =
      batch * channels * pooled_plane * pooled_height * pooled_width;
  GpuLaunchConfig config = GetGpuLaunchConfig(num_kernels, d);
  if (data_format == FORMAT_NHWC) {
    TF_CHECK_OK(GpuLaunchKernel(
        MaxPoolGradBackwardNoMaskNDHWC<T>, config.block_count,
        config.thread_per_block, 0, d.stream(), num_kernels, bottom_data,
        output_data, pooled_plane, pooled_height, pooled_width, channels,
        plane, height, width, kernel_p, kernel_h, kernel_w, stride_p,
        stride_h, stride_w, pad_p, pad_t, pad_l, top_diff, bottom_diff));
  } else {
    TF_CHECK_OK(GpuLaunchKernel(
        MaxPoolGradBackwardNoMaskNCDHW<T>, config.block_count,
        config.thread_per_block, 0, d.stream(), num_kernels, bottom_data,
        output_data, pooled_plane, pooled_height, pooled_width, channels,
        plane, height, width, kernel_p, kernel_h, kernel_w, stride_p,
        stride_h, stride_w, pad_p, pad_t, pad_l, top_diff, bottom_diff));
  }
  return d.ok();
}

}  // namespace functor

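// Explicitly instantiate the functor for the GPU-supported numeric types
// (e.g. Eigen::half and float) so the definitions above are emitted in this
// translation unit.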
#define DEFINE_GPU_SPECS(T) template struct functor::MaxPool3dGradBackward<T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
#undef DEFINE_GPU_SPECS

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM