• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #if GOOGLE_CUDA
17 #define EIGEN_USE_GPU
18 
19 #include <numeric>
20 #include "tensorflow/core/lib/core/status_test_util.h"
21 #include "tensorflow/core/platform/test.h"
22 #include "tensorflow/core/util/cuda_kernel_helper.h"
23 #include "tensorflow/core/util/cuda_launch_config.h"
24 
25 #define CUDA_EXPECT_SUCCESS                                 \
26   {                                                         \
27     cudaDeviceSynchronize();                                \
28     cudaError_t err = cudaGetLastError();                   \
29     EXPECT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
30   }
31 
32 #define CUDA_ASSERT_SUCCESS                                 \
33   {                                                         \
34     cudaDeviceSynchronize();                                \
35     cudaError_t err = cudaGetLastError();                   \
36     ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
37   }
38 
39 namespace tensorflow {
40 
41 namespace {
42 
SetOutbufZero(CudaLaunchConfig config,int * outbuf)43 __global__ void SetOutbufZero(CudaLaunchConfig config, int* outbuf) {
44   CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) { outbuf[x] = 0; }
45 }
46 
47 // counting number of jobs by using atomic +1
Count1D(CudaLaunchConfig config,int bufsize,int * outbuf)48 __global__ void Count1D(CudaLaunchConfig config, int bufsize, int* outbuf) {
49   CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
50     if (x < 0) {  // x might overflow when testing extreme case
51       break;
52     }
53     atomicAdd(&outbuf[x % bufsize], 1);
54   }
55 }
Count2D(Cuda2DLaunchConfig config,int bufsize,int * outbuf)56 __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
57   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
58     if (x < 0) {  // x might overflow when testing extreme case
59       break;
60     }
61     CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
62       if (y < 0) {  // y might overflow when testing extreme case
63         break;
64       }
65       int idx = x * config.virtual_thread_count.y + y;
66       atomicAdd(&outbuf[idx % bufsize], 1);
67     }
68   }
69 }
Count3D(Cuda3DLaunchConfig config,int bufsize,int * outbuf)70 __global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
71   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
72     if (x < 0) {  // x might overflow when testing extreme case
73       break;
74     }
75     CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
76       if (y < 0) {  // y might overflow when testing extreme case
77         break;
78       }
79       CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {
80         if (z < 0) {  // z might overflow when testing extreme case
81           break;
82         }
83         int idx =
84             x * config.virtual_thread_count.y * config.virtual_thread_count.z +
85             y * config.virtual_thread_count.z + z;
86         atomicAdd(&outbuf[idx % bufsize], 1);
87       }
88     }
89   }
90 }
91 
CudaShuffleGetSrcLaneTest(unsigned * failure_count)92 __global__ void CudaShuffleGetSrcLaneTest(unsigned* failure_count) {
93   unsigned lane_id = CudaLaneId();
94   for (int width = warpSize; width > 1; width /= 2) {
95     auto check_result = [&](const char* op_name, int param, unsigned actual,
96                             unsigned expected) {
97       if (actual != expected) {
98         printf("Cuda%sGetSrcLane(%d, %d) for lane %d returned %d, not %d\n",
99                op_name, param, width, lane_id, actual, expected);
100         CudaAtomicAdd(failure_count, 1);
101       }
102     };
103     for (int src_lane = -warpSize; src_lane <= warpSize; ++src_lane) {
104       unsigned actual_lane = detail::CudaShuffleGetSrcLane(src_lane, width);
105       unsigned expect_lane =
106           CudaShuffleSync(kCudaWarpAll, lane_id, src_lane, width);
107       check_result("Shuffle", src_lane, actual_lane, expect_lane);
108     }
109     for (unsigned delta = 0; delta <= warpSize; ++delta) {
110       unsigned actual_lane = detail::CudaShuffleUpGetSrcLane(delta, width);
111       unsigned expect_lane =
112           CudaShuffleUpSync(kCudaWarpAll, lane_id, delta, width);
113       check_result("ShuffleUp", delta, actual_lane, expect_lane);
114     }
115     for (unsigned delta = 0; delta <= warpSize; ++delta) {
116       unsigned actual_lane = detail::CudaShuffleDownGetSrcLane(delta, width);
117       unsigned expect_lane =
118           CudaShuffleDownSync(kCudaWarpAll, lane_id, delta, width);
119       check_result("ShuffleDown", delta, actual_lane, expect_lane);
120     }
121     for (int lane_lane = warpSize; lane_lane > 0; lane_lane /= 2) {
122       unsigned actual_lane = detail::CudaShuffleXorGetSrcLane(lane_lane, width);
123       unsigned expect_lane =
124           CudaShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width);
125       check_result("ShuffleXor", lane_lane, actual_lane, expect_lane);
126     }
127   }
128 }
129 
130 }  // namespace
131 
132 class CudaLaunchConfigTest : public ::testing::Test {
133  protected:
134   const int bufsize = 1024;
135   int* outbuf = nullptr;
136   Eigen::GpuStreamDevice stream;
137   Eigen::GpuDevice d = Eigen::GpuDevice(&stream);
138 
SetUp()139   virtual void SetUp() {
140     cudaError_t err = cudaMallocManaged(&outbuf, sizeof(int) * bufsize);
141     ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err);
142   }
143 
TearDown()144   virtual void TearDown() {
145     cudaDeviceSynchronize();
146     cudaFree(outbuf);
147     outbuf = nullptr;
148   }
149 };
150 
TEST_F(CudaLaunchConfigTest,GetCudaLaunchConfig)151 TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
152   CudaLaunchConfig cfg;
153 
154 // test valid inputs
155 #define TEST_LAUNCH_PARAMETER(work_element_count)                              \
156   cfg = GetCudaLaunchConfig(bufsize, d);                                       \
157   TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count,                 \
158                                cfg.thread_per_block, 0, d.stream(), cfg,       \
159                                outbuf));                                       \
160   CUDA_ASSERT_SUCCESS                                                          \
161   cfg = GetCudaLaunchConfig(work_element_count, d);                            \
162   TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
163                                0, d.stream(), cfg, bufsize, outbuf));          \
164   CUDA_EXPECT_SUCCESS                                                          \
165   EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
166                                                                                \
167   cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                  \
168   TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count,                 \
169                                cfg.thread_per_block, 0, d.stream(), cfg,       \
170                                outbuf));                                       \
171   CUDA_ASSERT_SUCCESS                                                          \
172   cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);             \
173   TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
174                                0, d.stream(), cfg, bufsize, outbuf));          \
175   CUDA_EXPECT_SUCCESS                                                          \
176   EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
177 
178   TEST_LAUNCH_PARAMETER(128);
179   TEST_LAUNCH_PARAMETER(129);
180   TEST_LAUNCH_PARAMETER(511);
181   TEST_LAUNCH_PARAMETER(512);
182   TEST_LAUNCH_PARAMETER(2048);
183   TEST_LAUNCH_PARAMETER(2049);
184   TEST_LAUNCH_PARAMETER(8191);
185   TEST_LAUNCH_PARAMETER(8192);
186   TEST_LAUNCH_PARAMETER(123456);
187   TEST_LAUNCH_PARAMETER(1 << 30);
188 #undef TEST_LAUNCH_PARAMETER
189 }
190 
operator ==(const Cuda2DLaunchConfig & a,const Cuda2DLaunchConfig & b)191 bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
192   return a.thread_per_block.x == b.thread_per_block.x &&
193          a.thread_per_block.y == b.thread_per_block.y &&
194          a.thread_per_block.z == b.thread_per_block.z &&
195          a.block_count.x == b.block_count.x &&
196          a.block_count.y == b.block_count.y &&
197          a.block_count.z == b.block_count.z &&
198          a.thread_per_block.x == b.thread_per_block.x &&
199          a.thread_per_block.y == b.thread_per_block.y &&
200          a.thread_per_block.z == b.thread_per_block.z;
201 }
202 
TEST_F(CudaLaunchConfigTest,GetCuda2DLaunchConfig)203 TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
204   Cuda2DLaunchConfig cfg;
205   CudaLaunchConfig cfg1d;
206 
207 // test valid inputs
208 #define TEST_LAUNCH_PARAMETER(dimx, dimy)                                     \
209   cfg1d = GetCudaLaunchConfig(bufsize, d);                                    \
210   TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,             \
211                                 cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
212                                 outbuf));                                     \
213   CUDA_ASSERT_SUCCESS                                                         \
214   cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                 \
215   TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                     \
216                                 cfg.thread_per_block, 0, d.stream(), cfg,     \
217                                 bufsize, outbuf));                            \
218   CUDA_EXPECT_SUCCESS                                                         \
219   EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0));        \
220                                                                               \
221   cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
222   TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,             \
223                                 cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
224                                 outbuf));                                     \
225   CUDA_ASSERT_SUCCESS                                                         \
226   cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                  \
227   TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                     \
228                                 cfg.thread_per_block, 0, d.stream(), cfg,     \
229                                 bufsize, outbuf));                            \
230   CUDA_EXPECT_SUCCESS                                                         \
231   EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
232 
233   TEST_LAUNCH_PARAMETER(128, 128);
234   TEST_LAUNCH_PARAMETER(129, 64);
235   TEST_LAUNCH_PARAMETER(511, 2048);
236   TEST_LAUNCH_PARAMETER(512, 512);
237   TEST_LAUNCH_PARAMETER(2048, 1024);
238   TEST_LAUNCH_PARAMETER(2049, 32);
239   TEST_LAUNCH_PARAMETER(8191, 1);
240   TEST_LAUNCH_PARAMETER(8192, 10);
241   TEST_LAUNCH_PARAMETER(123456, 12);
242   TEST_LAUNCH_PARAMETER(1, 1 << 30);
243   TEST_LAUNCH_PARAMETER(1 << 30, 1);
244 #undef TEST_LAUNCH_PARAMETER
245 }
246 
TEST_F(CudaLaunchConfigTest,GetCuda3DLaunchConfig)247 TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
248   Cuda3DLaunchConfig cfg;
249   CudaLaunchConfig cfg1d;
250 
251 // test valid inputs
252 #define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                               \
253   cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
254   TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,             \
255                                 cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
256                                 outbuf));                                     \
257   CUDA_ASSERT_SUCCESS                                                         \
258   cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);            \
259   TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count,                     \
260                                 cfg.thread_per_block, 0, d.stream(), cfg,     \
261                                 bufsize, outbuf));                            \
262   CUDA_EXPECT_SUCCESS                                                         \
263   EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
264 
265   TEST_LAUNCH_PARAMETER(128, 128, 128);
266   TEST_LAUNCH_PARAMETER(129, 64, 1024);
267   TEST_LAUNCH_PARAMETER(511, 2048, 128);
268   TEST_LAUNCH_PARAMETER(512, 512, 64);
269   TEST_LAUNCH_PARAMETER(2048, 1024, 128);
270   TEST_LAUNCH_PARAMETER(2049, 32, 1024);
271   TEST_LAUNCH_PARAMETER(8191, 1, 1024);
272   TEST_LAUNCH_PARAMETER(8192, 10, 32);
273   TEST_LAUNCH_PARAMETER(123456, 12, 21);
274   TEST_LAUNCH_PARAMETER(1, 1, 1 << 30);
275   TEST_LAUNCH_PARAMETER(1, 1 << 30, 1);
276   TEST_LAUNCH_PARAMETER(1 << 30, 1, 1);
277 #undef TEST_LAUNCH_PARAMETER
278 }
279 
TEST(CudaDeviceFunctionsTest,ShuffleGetSrcLane)280 TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
281   unsigned* failure_count;
282   ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
283   *failure_count = 0;
284   TF_EXPECT_OK(CudaLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
285                                 failure_count));
286   ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
287   ASSERT_EQ(*failure_count, 0);
288   cudaFree(failure_count);
289 }
290 
291 }  // namespace tensorflow
292 
293 #endif  // GOOGLE_CUDA
294