// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;

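// Check that a unary numext functor (isnan here) evaluated on the GPU gives
// the same answer whether the input stays in float or is cast to Eigen::half.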
template<typename>
void test_cuda_numext() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
  bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
  bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());

  Tensor<bool, 1> half_prec(num_elem);
  Tensor<bool, 1> full_prec(num_elem);
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking numext " << i << std::endl;
    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
  }

  gpu_device.deallocate(d_float);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}


#ifdef EIGEN_HAS_CUDA_FP16

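// Round-trip conversion test: cast float -> half -> float on the device and
// verify the values survive approximately.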
template<typename>
void test_cuda_conversion() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
      d_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
      d_conv, num_elem);

  gpu_float.device(gpu_device) = gpu_float.random();
  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
  gpu_conv.device(gpu_device) = gpu_half.cast<float>();

  Tensor<float, 1> initial(num_elem);
  Tensor<float, 1> final(num_elem);
  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
  // The copies are asynchronous: wait for them before reading the results.
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    VERIFY_IS_APPROX(initial(i), final(i));
  }

  gpu_device.deallocate(d_float);
  gpu_device.deallocate(d_half);
  gpu_device.deallocate(d_conv);
}

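// Compare a simple unary op (abs) computed entirely in half precision against
// the full-precision float result.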
template<typename>
void test_cuda_unary() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(gpu_device) = gpu_float.abs();
  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();

  Tensor<float, 1> half_prec(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking unary " << i << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
  }

  gpu_device.deallocate(d_float);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}

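// Compare a fused elementwise expression, (a + b) * a, evaluated in half
// precision against the float reference; both sides are rounded to half
// before the comparison.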
template<typename>
void test_cuda_elementwise() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
      d_float1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
      d_float2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  gpu_float1.device(gpu_device) = gpu_float1.random();
  gpu_float2.device(gpu_device) = gpu_float2.random();
  gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
  gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();

  Tensor<float, 1> half_prec(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
  }

  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}

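// Exercise transcendental functions (exp, log, log1p) in both precisions.
// Near 1, log is compared with a +0.1 offset since half precision loses
// accuracy there.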
template<typename>
void test_cuda_transcendental() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);

  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float2.constant(0.5f);
  gpu_float3.device(gpu_device) = gpu_float3.random();
  gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
  gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
  gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();

  // The half results are computed on inputs that have themselves been rounded
  // to half, so both sides see the same operand values.
  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
  gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();

  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
  gpu_res2_half.device(gpu_device) = gpu_res2_half.log();

  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
  gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();

  Tensor<float, 1> input1(num_elem);
  Tensor<Eigen::half, 1> half_prec1(num_elem);
  Tensor<Eigen::half, 1> full_prec1(num_elem);
  Tensor<float, 1> input2(num_elem);
  Tensor<Eigen::half, 1> half_prec2(num_elem);
  Tensor<Eigen::half, 1> full_prec2(num_elem);
  Tensor<float, 1> input3(num_elem);
  Tensor<Eigen::half, 1> half_prec3(num_elem);
  Tensor<Eigen::half, 1> full_prec3(num_elem);
  gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
    if (std::abs(input2(i) - 1.f) < 0.05f)  // log lacks accuracy near 1
      VERIFY_IS_APPROX(full_prec2(i) + Eigen::half(0.1f), half_prec2(i) + Eigen::half(0.1f));
    else
      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise log1p " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
  }
  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_float3);
  gpu_device.deallocate(d_res1_half);
  gpu_device.deallocate(d_res1_float);
  gpu_device.deallocate(d_res2_half);
  gpu_device.deallocate(d_res2_float);
  gpu_device.deallocate(d_res3_float);
  gpu_device.deallocate(d_res3_half);
}

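// Contract two 23x23 matrices in half precision and compare against the float
// contraction. Small absolute differences are tolerated; only deviations
// larger than 1e-2 must still satisfy VERIFY_IS_APPROX.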
template<typename>
void test_cuda_contractions() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int rows = 23;
  int cols = 23;
  int num_elem = rows*cols;

  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
      d_float1, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
      d_float2, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
      d_res_half, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
      d_res_float, rows, cols);

  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);

  typedef Tensor<float, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
  gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);

  Tensor<Eigen::half, 2> half_prec(rows, cols);
  Tensor<Eigen::half, 2> full_prec(rows, cols);
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
  gpu_device.synchronize();

  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      std::cout << "Checking contract " << i << " " << j << ": " << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
      }
    }
  }

  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}

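// Reduce a 2-d tensor along one dimension in both precisions and compare the
// resulting per-slice sums.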
template<typename>
void test_cuda_reductions(int size1, int size2, int redux) {

  std::cout << "Reducing " << size1 << " by " << size2
            << " tensor along dim " << redux << std::endl;

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = size1*size2;
  int result_size = (redux == 1 ? size1 : size2);

  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
      d_float1, size1, size2);
  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
      d_float2, size1, size2);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, result_size);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, result_size);

  gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
  gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;

  Eigen::array<int, 1> redux_dim = {{redux}};
  gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);

  Tensor<Eigen::half, 1> half_prec(result_size);
  Tensor<Eigen::half, 1> full_prec(result_size);
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
  gpu_device.synchronize();

  for (int i = 0; i < result_size; ++i) {
    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
  }

  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}

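// Run the reduction comparison over a few sizes and both reduction dimensions.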
template<typename>
void test_cuda_reductions() {
  test_cuda_reductions<void>(13, 13, 0);
  test_cuda_reductions<void>(13, 13, 1);

  test_cuda_reductions<void>(35, 36, 0);
  test_cuda_reductions<void>(35, 36, 1);

  test_cuda_reductions<void>(36, 35, 0);
  test_cuda_reductions<void>(36, 35, 1);
}

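// Full reductions (sum and maximum) down to a scalar, in half vs. float.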
template<typename>
void test_cuda_full_reductions() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int size = 13;
  int num_elem = size*size;

  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
      d_float1, size, size);
  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
      d_float2, size, size);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
      d_res_half);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
      d_res_float);

  gpu_float1.device(gpu_device) = gpu_float1.random();
  gpu_float2.device(gpu_device) = gpu_float2.random();

  gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();

  Tensor<Eigen::half, 0> half_prec;
  Tensor<Eigen::half, 0> full_prec;
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
  gpu_device.synchronize();

  VERIFY_IS_APPROX(full_prec(), half_prec());

  gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
  gpu_device.synchronize();

  VERIFY_IS_APPROX(full_prec(), half_prec());

  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}

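// Force intermediate evaluation of half-precision expressions (via eval() and
// a factor-1 broadcast) and check the results still match the float reference.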
template<typename>
void test_cuda_forced_evals() {

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
      d_res_half1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
      d_res_half2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  Eigen::array<int, 1> no_bcast;
  no_bcast[0] = 1;

  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(gpu_device) = gpu_float.abs();
  gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
  gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();

  Tensor<float, 1> half_prec1(num_elem);
  Tensor<float, 1> half_prec2(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking forced eval " << i << ": " << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
  }

  gpu_device.deallocate(d_float);
  gpu_device.deallocate(d_res_half1);
  gpu_device.deallocate(d_res_half2);
  gpu_device.deallocate(d_res_float);
}
#endif


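// Test entry point. The numext check runs unconditionally; the remaining
// subtests require fp16 support (EIGEN_HAS_CUDA_FP16).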
void test_cxx11_tensor_of_float16_cuda()
{
  CALL_SUBTEST_1(test_cuda_numext<void>());

#ifdef EIGEN_HAS_CUDA_FP16
  CALL_SUBTEST_1(test_cuda_conversion<void>());
  CALL_SUBTEST_1(test_cuda_unary<void>());
  CALL_SUBTEST_1(test_cuda_elementwise<void>());
  CALL_SUBTEST_1(test_cuda_transcendental<void>());
  CALL_SUBTEST_2(test_cuda_contractions<void>());
  CALL_SUBTEST_3(test_cuda_reductions<void>());
  CALL_SUBTEST_4(test_cuda_full_reductions<void>());
  CALL_SUBTEST_5(test_cuda_forced_evals<void>());
#else
  std::cout << "Half floats are not supported by this version of CUDA: skipping the test" << std::endl;
#endif
}