/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
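// Tests for the CUDA launch-configuration helpers and the warp-shuffle
// wrappers in cuda_kernel_helper.h / cuda_launch_config.h.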

#if GOOGLE_CUDA
#define EIGEN_USE_GPU

#include <numeric>

#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/cuda_launch_config.h"

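// Synchronize with the device, then check the last CUDA error.
// CUDA_EXPECT_SUCCESS records a failure and continues; CUDA_ASSERT_SUCCESS
// aborts the current test on failure.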
#define CUDA_EXPECT_SUCCESS                                 \
  {                                                         \
    cudaDeviceSynchronize();                                \
    cudaError_t err = cudaGetLastError();                   \
    EXPECT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
  }

#define CUDA_ASSERT_SUCCESS                                 \
  {                                                         \
    cudaDeviceSynchronize();                                \
    cudaError_t err = cudaGetLastError();                   \
    ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
  }

namespace tensorflow {

namespace {

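// Zeroes the counting buffer; each virtual thread clears one element.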
__global__ void SetOutbufZero(CudaLaunchConfig config, int* outbuf) {
  CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) { outbuf[x] = 0; }
}

// Counts executed jobs by atomically incrementing one histogram bucket per
// virtual thread.
__global__ void Count1D(CudaLaunchConfig config, int bufsize, int* outbuf) {
  CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
    if (x < 0) {  // x might overflow when testing extreme cases.
      break;
    }
    atomicAdd(&outbuf[x % bufsize], 1);
  }
}
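
// 2-D variant: flattens (x, y) into a row-major index before counting.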
__global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
    if (x < 0) {  // x might overflow when testing extreme cases.
      break;
    }
    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
      if (y < 0) {  // y might overflow when testing extreme cases.
        break;
      }
      int idx = x * config.virtual_thread_count.y + y;
      atomicAdd(&outbuf[idx % bufsize], 1);
    }
  }
}
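
// 3-D variant: flattens (x, y, z) into a row-major index before counting.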
__global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
    if (x < 0) {  // x might overflow when testing extreme cases.
      break;
    }
    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
      if (y < 0) {  // y might overflow when testing extreme cases.
        break;
      }
      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {
        if (z < 0) {  // z might overflow when testing extreme cases.
          break;
        }
        int idx =
            x * config.virtual_thread_count.y * config.virtual_thread_count.z +
            y * config.virtual_thread_count.z + z;
        atomicAdd(&outbuf[idx % bufsize], 1);
      }
    }
  }
}

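// Checks that the host-side predictions in detail::CudaShuffle*GetSrcLane
// match the source lane actually read by the corresponding CudaShuffle*Sync
// wrapper, for every supported width and offset.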
__global__ void CudaShuffleGetSrcLaneTest(unsigned* failure_count) {
  unsigned lane_id = CudaLaneId();
  for (int width = warpSize; width > 1; width /= 2) {
    auto check_result = [&](const char* op_name, int param, unsigned actual,
                            unsigned expected) {
      if (actual != expected) {
        printf("Cuda%sGetSrcLane(%d, %d) for lane %u returned %u, not %u\n",
               op_name, param, width, lane_id, actual, expected);
        CudaAtomicAdd(failure_count, 1);
      }
    };
    for (int src_lane = -warpSize; src_lane <= warpSize; ++src_lane) {
      unsigned actual_lane = detail::CudaShuffleGetSrcLane(src_lane, width);
      unsigned expect_lane =
          CudaShuffleSync(kCudaWarpAll, lane_id, src_lane, width);
      check_result("Shuffle", src_lane, actual_lane, expect_lane);
    }
    for (unsigned delta = 0; delta <= warpSize; ++delta) {
      unsigned actual_lane = detail::CudaShuffleUpGetSrcLane(delta, width);
      unsigned expect_lane =
          CudaShuffleUpSync(kCudaWarpAll, lane_id, delta, width);
      check_result("ShuffleUp", delta, actual_lane, expect_lane);
    }
    for (unsigned delta = 0; delta <= warpSize; ++delta) {
      unsigned actual_lane = detail::CudaShuffleDownGetSrcLane(delta, width);
      unsigned expect_lane =
          CudaShuffleDownSync(kCudaWarpAll, lane_id, delta, width);
      check_result("ShuffleDown", delta, actual_lane, expect_lane);
    }
    for (int lane_mask = warpSize; lane_mask > 0; lane_mask /= 2) {
      unsigned actual_lane = detail::CudaShuffleXorGetSrcLane(lane_mask, width);
      unsigned expect_lane =
          CudaShuffleXorSync(kCudaWarpAll, lane_id, lane_mask, width);
      check_result("ShuffleXor", lane_mask, actual_lane, expect_lane);
    }
  }
}

}  // namespace

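// Fixture that owns a managed-memory counting buffer shared by the
// launch-config tests below.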
class CudaLaunchConfigTest : public ::testing::Test {
 protected:
  const int bufsize = 1024;
  int* outbuf = nullptr;
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice d = Eigen::GpuDevice(&stream);

  void SetUp() override {
    cudaError_t err = cudaMallocManaged(&outbuf, sizeof(int) * bufsize);
    ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err);
  }

  void TearDown() override {
    cudaDeviceSynchronize();
    cudaFree(outbuf);
    outbuf = nullptr;
  }
};

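// Each TEST_LAUNCH_PARAMETER expansion zeroes outbuf, launches Count1D with
// the computed config, and checks that exactly work_element_count atomic
// increments landed in outbuf. The first half uses the heuristic
// GetCudaLaunchConfig overload; the second half uses the overload that takes
// the kernel itself so the configuration can be tuned to it.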
TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
  CudaLaunchConfig cfg;

  // Test valid inputs.
#define TEST_LAUNCH_PARAMETER(work_element_count)                              \
  cfg = GetCudaLaunchConfig(bufsize, d);                                       \
  TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count,                 \
                               cfg.thread_per_block, 0, d.stream(), cfg,       \
                               outbuf));                                       \
  CUDA_ASSERT_SUCCESS                                                          \
  cfg = GetCudaLaunchConfig(work_element_count, d);                            \
  TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
                               0, d.stream(), cfg, bufsize, outbuf));          \
  CUDA_EXPECT_SUCCESS                                                          \
  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
                                                                               \
  cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                  \
  TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count,                 \
                               cfg.thread_per_block, 0, d.stream(), cfg,       \
                               outbuf));                                       \
  CUDA_ASSERT_SUCCESS                                                          \
  cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);             \
  TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
                               0, d.stream(), cfg, bufsize, outbuf));          \
  CUDA_EXPECT_SUCCESS                                                          \
  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))

  TEST_LAUNCH_PARAMETER(128);
  TEST_LAUNCH_PARAMETER(129);
  TEST_LAUNCH_PARAMETER(511);
  TEST_LAUNCH_PARAMETER(512);
  TEST_LAUNCH_PARAMETER(2048);
  TEST_LAUNCH_PARAMETER(2049);
  TEST_LAUNCH_PARAMETER(8191);
  TEST_LAUNCH_PARAMETER(8192);
  TEST_LAUNCH_PARAMETER(123456);
  TEST_LAUNCH_PARAMETER(1 << 30);
#undef TEST_LAUNCH_PARAMETER
}

bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
  return a.thread_per_block.x == b.thread_per_block.x &&
         a.thread_per_block.y == b.thread_per_block.y &&
         a.thread_per_block.z == b.thread_per_block.z &&
         a.block_count.x == b.block_count.x &&
         a.block_count.y == b.block_count.y &&
         a.block_count.z == b.block_count.z &&
         a.virtual_thread_count.x == b.virtual_thread_count.x &&
         a.virtual_thread_count.y == b.virtual_thread_count.y &&
         a.virtual_thread_count.z == b.virtual_thread_count.z;
}

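// Same pattern as the 1-D test, driving Count2D over a dimx-by-dimy grid.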
TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
  Cuda2DLaunchConfig cfg;
  CudaLaunchConfig cfg1d;

  // Test valid inputs.
#define TEST_LAUNCH_PARAMETER(dimx, dimy)                               \
  cfg1d = GetCudaLaunchConfig(bufsize, d);                              \
  TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,       \
                                cfg1d.thread_per_block, 0, d.stream(),  \
                                cfg1d, outbuf));                        \
  CUDA_ASSERT_SUCCESS                                                   \
  cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                           \
  TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,               \
                                cfg.thread_per_block, 0, d.stream(),    \
                                cfg, bufsize, outbuf));                 \
  CUDA_EXPECT_SUCCESS                                                   \
  EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \
                                                                        \
  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);         \
  TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,       \
                                cfg1d.thread_per_block, 0, d.stream(),  \
                                cfg1d, outbuf));                        \
  CUDA_ASSERT_SUCCESS                                                   \
  cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);            \
  TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,               \
                                cfg.thread_per_block, 0, d.stream(),    \
                                cfg, bufsize, outbuf));                 \
  CUDA_EXPECT_SUCCESS                                                   \
  EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0))

  TEST_LAUNCH_PARAMETER(128, 128);
  TEST_LAUNCH_PARAMETER(129, 64);
  TEST_LAUNCH_PARAMETER(511, 2048);
  TEST_LAUNCH_PARAMETER(512, 512);
  TEST_LAUNCH_PARAMETER(2048, 1024);
  TEST_LAUNCH_PARAMETER(2049, 32);
  TEST_LAUNCH_PARAMETER(8191, 1);
  TEST_LAUNCH_PARAMETER(8192, 10);
  TEST_LAUNCH_PARAMETER(123456, 12);
  TEST_LAUNCH_PARAMETER(1, 1 << 30);
  TEST_LAUNCH_PARAMETER(1 << 30, 1);
#undef TEST_LAUNCH_PARAMETER
}

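// Same pattern for the 3-D config; only the kernel-aware overload of
// GetCuda3DLaunchConfig is exercised here.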
TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
  Cuda3DLaunchConfig cfg;
  CudaLaunchConfig cfg1d;

  // Test valid inputs.
#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                        \
  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);        \
  TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,      \
                                cfg1d.thread_per_block, 0, d.stream(), \
                                cfg1d, outbuf));                       \
  CUDA_ASSERT_SUCCESS                                                  \
  cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);     \
  TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count,              \
                                cfg.thread_per_block, 0, d.stream(),   \
                                cfg, bufsize, outbuf));                \
  CUDA_EXPECT_SUCCESS                                                  \
  EXPECT_EQ(dimx * dimy * dimz,                                        \
            std::accumulate(outbuf, outbuf + bufsize, 0))

  TEST_LAUNCH_PARAMETER(128, 128, 128);
  TEST_LAUNCH_PARAMETER(129, 64, 1024);
  TEST_LAUNCH_PARAMETER(511, 2048, 128);
  TEST_LAUNCH_PARAMETER(512, 512, 64);
  TEST_LAUNCH_PARAMETER(2048, 1024, 128);
  TEST_LAUNCH_PARAMETER(2049, 32, 1024);
  TEST_LAUNCH_PARAMETER(8191, 1, 1024);
  TEST_LAUNCH_PARAMETER(8192, 10, 32);
  TEST_LAUNCH_PARAMETER(123456, 12, 21);
  TEST_LAUNCH_PARAMETER(1, 1, 1 << 30);
  TEST_LAUNCH_PARAMETER(1, 1 << 30, 1);
  TEST_LAUNCH_PARAMETER(1 << 30, 1, 1);
#undef TEST_LAUNCH_PARAMETER
}

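// Host-side driver: runs CudaShuffleGetSrcLaneTest on a single warp (one
// block of 32 threads) and verifies that no lane recorded a mismatch.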
TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
  unsigned* failure_count;
  ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
  *failure_count = 0;
  TF_EXPECT_OK(CudaLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
                                failure_count));
  ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
  ASSERT_EQ(*failure_count, 0);
  cudaFree(failure_count);
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA