#include <gtest/gtest.h>
#include <ATen/cuda/Atomic.cuh>
#include <c10/test/util/Macros.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>

#include <cmath>
#include <limits>
#include <vector>

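// Test geometry: one block of `blocksize` threads writes into `arraysize`
// output slots, so each slot is updated by exactly `factor` competing
// threads.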
constexpr int blocksize = 256;
constexpr int factor = 4;
constexpr int arraysize = blocksize / factor;

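// `factor` threads race on each slot of `sum`; the atomic add must
// serialize them so that no increment is lost.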
template <typename T>
__global__ void addition_test_kernel(T* a, T* sum) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int idx = tid % arraysize;

  gpuAtomicAdd(&sum[idx], a[idx]);
}

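// Same access pattern as the add kernel, but each slot is multiplied
// `factor` times by the same operand.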
template <typename T>
__global__ void mul_test_kernel(T* a, T* sum) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int idx = tid % arraysize;

  gpuAtomicMul(&sum[idx], a[idx]);
}

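// Unlike add/mul, the max/min kernels read `factor` distinct inputs per
// output slot: thread tid reduces a[tid] into max[tid / factor].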
template <typename T>
__global__ void max_test_kernel(T* a, T* max) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int a_idx = tid % (arraysize * factor);
  int idx = a_idx / factor;

  gpuAtomicMax(&max[idx], a[a_idx]);
}

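// Mirror of the max kernel, reducing with gpuAtomicMin instead.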
template <typename T>
__global__ void min_test_kernel(T* a, T* min) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int a_idx = tid % (arraysize * factor);
  int idx = a_idx / factor;

  gpuAtomicMin(&min[idx], a[a_idx]);
}

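// Host driver: uploads a == 1 and sum == 0, launches a single block, and
// checks that every slot was incremented exactly `factor` times.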
template <typename T>
void test_atomic_add() {
  dim3 dimBlock(blocksize, 1);
  dim3 dimGrid(1, 1);

  T *ad, *sumd;

  std::vector<T> a(arraysize);
  std::vector<T> sum(arraysize);
  std::vector<T> answer(arraysize);

  for (int i = 0; i < arraysize; ++i) {
    a[i] = 1;
    sum[i] = 0;
    answer[i] = factor;
  }

  cudaMalloc((void**)&ad, arraysize * sizeof(T));
  cudaMalloc((void**)&sumd, arraysize * sizeof(T));

  cudaMemcpy(ad, a.data(), arraysize * sizeof(T), cudaMemcpyHostToDevice);
  cudaMemcpy(sumd, sum.data(), arraysize * sizeof(T), cudaMemcpyHostToDevice);

  addition_test_kernel<<<dimGrid, dimBlock>>>(ad, sumd);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  cudaMemcpy(sum.data(), sumd, arraysize * sizeof(T), cudaMemcpyDeviceToHost);

  for (int i = 0; i < arraysize; ++i) {
    ASSERT_EQ(sum[i], answer[i]) << typeid(T).name();
  }

  cudaFree(ad);
  cudaFree(sumd);
}

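// Starting from sum == 2 and multiplying by a == 2 once per thread, the
// expected result per slot is 2 * 2^factor == 2^(factor + 1).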
template <typename T>
void test_atomic_mul() {
  dim3 dimBlock(blocksize, 1);
  dim3 dimGrid(1, 1);

  T *ad, *sumd;

  std::vector<T> a(arraysize);
  std::vector<T> sum(arraysize);
  std::vector<T> answer(arraysize);

  for (int i = 0; i < arraysize; ++i) {
    a[i] = 2;
    sum[i] = 2;
    answer[i] = pow(sum[i], static_cast<T>(factor + 1));
  }

  cudaMalloc((void**)&ad, arraysize * sizeof(T));
  cudaMalloc((void**)&sumd, arraysize * sizeof(T));

  cudaMemcpy(ad, a.data(), arraysize * sizeof(T), cudaMemcpyHostToDevice);
  cudaMemcpy(sumd, sum.data(), arraysize * sizeof(T), cudaMemcpyHostToDevice);

  mul_test_kernel<<<dimGrid, dimBlock>>>(ad, sumd);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  cudaMemcpy(sum.data(), sumd, arraysize * sizeof(T), cudaMemcpyDeviceToHost);

  for (int i = 0; i < arraysize; ++i) {
    ASSERT_EQ(sum[i], answer[i]) << typeid(T).name();
  }

  cudaFree(ad);
  cudaFree(sumd);
}

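// Inputs are 0 .. arraysize * factor - 1; the running max for output slot j
// starts at the type's lowest value and should end at the largest input in
// its group, (j + 1) * factor - 1.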
template <typename T>
void test_atomic_max() {
  dim3 dimBlock(blocksize, 1);
  dim3 dimGrid(1, 1);

  T *ad, *sumd;

  std::vector<T> a(arraysize * factor);
  std::vector<T> sum(arraysize);
  std::vector<T> answer(arraysize);

  int j;
  for (int i = 0; i < arraysize * factor; ++i) {
    a[i] = i;
    if (i % factor == 0) {
      j = i / factor;
      sum[j] = std::numeric_limits<T>::lowest();
      answer[j] = (j + 1) * factor - 1;
    }
  }

  cudaMalloc((void**)&ad, arraysize * factor * sizeof(T));
  cudaMalloc((void**)&sumd, arraysize * sizeof(T));

  cudaMemcpy(ad, a.data(), arraysize * factor * sizeof(T), cudaMemcpyHostToDevice);
  cudaMemcpy(sumd, sum.data(), arraysize * sizeof(T), cudaMemcpyHostToDevice);

  max_test_kernel<<<dimGrid, dimBlock>>>(ad, sumd);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  cudaMemcpy(sum.data(), sumd, arraysize * sizeof(T), cudaMemcpyDeviceToHost);

  for (int i = 0; i < arraysize; ++i) {
    ASSERT_EQ(sum[i], answer[i]) << typeid(T).name();
  }

  cudaFree(ad);
  cudaFree(sumd);
}

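// Symmetric to the max test: slot j starts at the type's maximum value and
// should end at the smallest input in its group, j * factor.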
template <typename T>
void test_atomic_min() {
  dim3 dimBlock(blocksize, 1);
  dim3 dimGrid(1, 1);

  T *ad, *sumd;

  std::vector<T> a(arraysize * factor);
  std::vector<T> sum(arraysize);
  std::vector<T> answer(arraysize);

  int j;
  for (int i = 0; i < arraysize * factor; ++i) {
    a[i] = i;
    if (i % factor == 0) {
      j = i / factor;
      sum[j] = std::numeric_limits<T>::max();
      answer[j] = j * factor;
    }
  }

  cudaMalloc((void**)&ad, arraysize * factor * sizeof(T));
  cudaMalloc((void**)&sumd, arraysize * sizeof(T));

  cudaMemcpy(ad, a.data(), arraysize * factor * sizeof(T), cudaMemcpyHostToDevice);
  cudaMemcpy(sumd, sum.data(), arraysize * sizeof(T), cudaMemcpyHostToDevice);

  min_test_kernel<<<dimGrid, dimBlock>>>(ad, sumd);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  cudaMemcpy(sum.data(), sumd, arraysize * sizeof(T), cudaMemcpyDeviceToHost);

  for (int i = 0; i < arraysize; ++i) {
    ASSERT_EQ(sum[i], answer[i]) << typeid(T).name();
  }

  cudaFree(ad);
  cudaFree(sumd);
}

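// Each test bails out early when no CUDA device is present, so the suite
// still passes on CPU-only machines.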
TEST(TestAtomicOps, TestAtomicAdd) {
  if (!at::cuda::is_available()) return;
  test_atomic_add<uint8_t>();
  test_atomic_add<int8_t>();
  test_atomic_add<int16_t>();
  test_atomic_add<int32_t>();
  test_atomic_add<int64_t>();

  test_atomic_add<at::BFloat16>();
  test_atomic_add<at::Half>();
  test_atomic_add<float>();
  test_atomic_add<double>();
  test_atomic_add<c10::complex<float>>();
  test_atomic_add<c10::complex<double>>();
}

TEST(TestAtomicOps, DISABLED_ON_WINDOWS(TestAtomicMul)) {
  if (!at::cuda::is_available()) return;
  test_atomic_mul<uint8_t>();
  test_atomic_mul<int8_t>();
  test_atomic_mul<int16_t>();
  test_atomic_mul<int32_t>();
  test_atomic_mul<int64_t>();
  test_atomic_mul<at::BFloat16>();
  test_atomic_mul<at::Half>();
  test_atomic_mul<float>();
  test_atomic_mul<double>();
}

TEST(TestAtomicOps, DISABLED_ON_WINDOWS(TestAtomicMax)) {
  if (!at::cuda::is_available()) return;
  test_atomic_max<uint8_t>();
  test_atomic_max<int8_t>();
  test_atomic_max<int16_t>();
  test_atomic_max<int32_t>();
  test_atomic_max<int64_t>();
  test_atomic_max<at::BFloat16>();
  test_atomic_max<at::Half>();
  test_atomic_max<float>();
  test_atomic_max<double>();
}

TEST(TestAtomicOps, DISABLED_ON_WINDOWS(TestAtomicMin)) {
  if (!at::cuda::is_available()) return;
  test_atomic_min<uint8_t>();
  test_atomic_min<int8_t>();
  test_atomic_min<int16_t>();
  test_atomic_min<int32_t>();
  test_atomic_min<int64_t>();
  test_atomic_min<at::BFloat16>();
  test_atomic_min<at::Half>();
  test_atomic_min<float>();
  test_atomic_min<double>();
}