1 #include <gtest/gtest.h>
2
3 // Test IntegerDivider: this tests *all* 32-bit pairs (a, b) where a % b is 0 or
4 // (b-1), so it takes a few minutes to run.
5
6 #include <assert.h>
7 #include <stdint.h>
8 #include <memory>
9 #include <vector>
10
11 #include <ATen/cuda/CUDAContext.h>
12 #include <ATen/cuda/detail/IntegerDivider.cuh>
13
14 using std::vector;
15 using at::cuda::detail::IntDivider;
16 using at::cuda::detail::DivMod;
17
18 template<typename Value>
19 struct TestCase {
20 Value dividend;
21 int divisor_idx;
22 int steps;
23
TestCaseTestCase24 TestCase(Value dividend, int divisor_idx, int steps)
25 : dividend(dividend), divisor_idx(divisor_idx), steps(steps) {}
26 };
27
28 template <typename Value>
testIntDivider(const IntDivider<Value> * dividers,const TestCase<Value> * testCases,int numCases)29 __global__ void testIntDivider(
30 const IntDivider<Value>* dividers,
31 const TestCase<Value>* testCases,
32 int numCases) {
33 int index = blockIdx.x * blockDim.x + threadIdx.x;
34 int stride = blockDim.x * gridDim.x;
35 for (int i = index; i < numCases; i += stride) {
36 const TestCase<Value>& tc = testCases[i];
37 Value dividend = tc.dividend;
38 const IntDivider<Value>& divider = dividers[tc.divisor_idx];
39 Value divisor = divider.divisor;
40
41 for (int j = 0; j < tc.steps; j++) {
42 if (sizeof(Value) == 4 && dividend > INT32_MAX)
43 return;
44
45 DivMod<Value> qr = divider.divmod(dividend);
46 assert(qr.div == dividend / divisor && qr.mod == dividend % divisor);
47 dividend += divisor;
48 }
49 }
50 }
51
52 enum {
53 // Number of test cases per each kernel invocation.
54 NUM_CASES = 1000000,
55
56 // Maximum number of steps per each test case.
57 MAX_STEPS = 10000,
58 };
59
60 // Test the magic division algorithm.
61 template<typename Value>
62 class IntDividerTester {
63 public:
IntDividerTester()64 IntDividerTester() {
65 cudaError_t err;
66
67 err = cudaMalloc(÷rsBuf_, NUM_CASES * sizeof(IntDivider<Value>));
68 bool isEQ = err == cudaSuccess;
69 EXPECT_TRUE(isEQ);
70 err = cudaMalloc(&testCasesBuf_, NUM_CASES * sizeof(TestCase<Value>));
71 isEQ = err == cudaSuccess;
72 EXPECT_TRUE(isEQ);
73 }
74
~IntDividerTester()75 ~IntDividerTester() {
76 cudaError_t err;
77
78 err = cudaFree(dividersBuf_);
79 bool isEQ = err == cudaSuccess;
80 EXPECT_TRUE(isEQ);
81 err = cudaFree(testCasesBuf_);
82 isEQ = err == cudaSuccess;
83 EXPECT_TRUE(isEQ);
84 }
85
addTestCase(Value dividend,Value divisor,int steps)86 void addTestCase(Value dividend, Value divisor, int steps) {
87 // Append a new IntDivider using 'divisor' if necessary.
88 if (dividers_.empty() || dividers_.back().divisor != divisor)
89 dividers_.emplace_back(divisor);
90
91 // Append the test case.
92 testCases_.emplace_back(dividend, dividers_.size() - 1, steps);
93
94 // Launch the test kernel if the buffer is full.
95 if (testCases_.size() == NUM_CASES)
96 flush();
97 }
98
flush()99 void flush() {
100 cudaError_t err;
101 bool isTrue;
102 if (testCases_.empty())
103 return;
104
105 ASSERT_FALSE(dividers_.empty());
106
107 isTrue = dividers_.size() <= NUM_CASES;
108 ASSERT_TRUE(isTrue);
109 isTrue = testCases_.size() <= NUM_CASES;
110 ASSERT_TRUE(isTrue);
111 err = cudaMemcpy(
112 dividersBuf_,
113 dividers_.data(),
114 dividers_.size() * sizeof(IntDivider<Value>),
115 cudaMemcpyHostToDevice);
116 isTrue = err == cudaSuccess;
117 ASSERT_TRUE(isTrue);
118 err = cudaMemcpy(
119 testCasesBuf_,
120 testCases_.data(),
121 testCases_.size() * sizeof(TestCase<Value>),
122 cudaMemcpyHostToDevice);
123 isTrue = err == cudaSuccess;
124 ASSERT_TRUE(isTrue);
125
126 int numCases = testCases_.size();
127 testIntDivider<Value><<<512, 512>>>(dividersBuf_, testCasesBuf_, numCases);
128 C10_CUDA_KERNEL_LAUNCH_CHECK();
129
130 dividers_.clear();
131 testCases_.clear();
132 }
133
134 private:
135 vector<IntDivider<Value>> dividers_;
136 vector<TestCase<Value>> testCases_;
137
138 IntDivider<Value>* dividersBuf_;
139 TestCase<Value>* testCasesBuf_;
140 };
141
testUint32Divider()142 static void testUint32Divider()
143 {
144 fprintf(stderr, "Testing 32-bit integer division ...");
145
146 IntDividerTester<uint32_t> tester;
147
148 for (uint64_t divisor = 1; divisor <= INT32_MAX; divisor++) {
149 if (divisor < 1000000 && divisor % 10000 == 0)
150 fprintf(stderr, ".");
151 if (divisor % 10000000 == 0)
152 fprintf(stderr, "-");
153
154 // In order to save time, we only test when the remainder is zero or
155 // (divisor - 1).
156 uint64_t dividend = 0;
157 while (dividend <= INT32_MAX) {
158 uint64_t steps = (INT32_MAX - dividend) / divisor + 1;
159 if (steps > MAX_STEPS)
160 steps = MAX_STEPS;
161
162 tester.addTestCase(dividend, divisor, steps);
163 tester.addTestCase(dividend + divisor - 1, divisor, steps);
164
165 dividend += divisor * steps;
166 }
167
168 // Check the boundary cases.
169 tester.addTestCase(1, divisor, 1);
170 tester.addTestCase(INT32_MAX, divisor, 1);
171 }
172
173 tester.flush();
174
175 fprintf(stderr, " Done!\n");
176 }
177
178 // uint64_t divider uses plain division, so we just check a few random cases.
testUint64Divider()179 static void testUint64Divider()
180 {
181 IntDividerTester<uint64_t> tester;
182
183 uint64_t dividend = 0x123456789ULL;
184 uint64_t divisor = 0x54321ULL;
185
186 for (int i = 0; i < 1000; i++) {
187 if (divisor != 0) {
188 tester.addTestCase(dividend, divisor, 100);
189
190 // Test small divisor.
191 tester.addTestCase(dividend, divisor % 65536, 100);
192
193 // Create pseudorandom numbers.
194 dividend *= 0x100000001b3ULL;
195 dividend ^= 0x1234567890abcdefULL;
196 divisor *= 0x100000001b3ULL;
197 divisor ^= 0x1234567890abcdefULL;
198 }
199 }
200
201 tester.flush();
202 }
203
TEST(TestCUDAIntegerDivider,IntegerDivider)204 TEST(TestCUDAIntegerDivider, IntegerDivider) {
205 if (!at::cuda::is_available()) return;
206 testUint64Divider();
207 testUint32Divider();
208
209 cudaError_t err = cudaDeviceSynchronize();
210 bool isTrue = err == cudaSuccess;
211 ASSERT_TRUE(isTrue);
212 }
213