// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE

#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;

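// Checks GPU evaluation of simple complex-valued expressions: filling a tensor
// with a complex constant and taking the modulus of a complex tensor, using raw
// cudaMalloc/cudaMemcpy for the device buffers.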
void test_cuda_nullary() {
  Tensor<std::complex<float>, 1, 0, int> in1(2);
  Tensor<std::complex<float>, 1, 0, int> in2(2);
  in1.setRandom();
  in2.setRandom();

  std::size_t float_bytes = in1.size() * sizeof(float);
  std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>);

  std::complex<float>* d_in1;
  std::complex<float>* d_in2;
  float* d_out2;
  cudaMalloc((void**)(&d_in1), complex_bytes);
  cudaMalloc((void**)(&d_in2), complex_bytes);
  cudaMalloc((void**)(&d_out2), float_bytes);
  cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);

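  // Run the tensor expressions below on a CUDA stream wrapped in a GpuDevice.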
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
      d_in1, 2);
  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2(
      d_in2, 2);
  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2(
      d_out2, 2);

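  // Evaluate on the GPU: overwrite in1 with a complex constant and store |in2| in out2.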
  gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f));
  gpu_out2.device(gpu_device) = gpu_in2.abs();

  Tensor<std::complex<float>, 1, 0, int> new1(2);
  Tensor<float, 1, 0, int> new2(2);

  assert(cudaMemcpyAsync(new1.data(), d_in1, complex_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaMemcpyAsync(new2.data(), d_out2, float_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);

  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f));
    VERIFY_IS_APPROX(new2(i), std::abs(in2(i)));
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out2);
}


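// Compares a full sum reduction of a random complex matrix computed on the GPU
// against the same reduction computed on the host.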
static void test_cuda_sum_reductions() {

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
  in.setRandom();

  Tensor<std::complex<float>, 0> full_redux;
  full_redux = in.sum();

  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);

  out_gpu.device(gpu_device) = in_gpu.sum();

  Tensor<std::complex<float>, 0> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(gpu_in_ptr);
  gpu_device.deallocate(gpu_out_ptr);
}

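// Same comparison as above, but for a mean reduction.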
static void test_cuda_mean_reductions() {

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
  in.setRandom();

  Tensor<std::complex<float>, 0> full_redux;
  full_redux = in.mean();

  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);

  out_gpu.device(gpu_device) = in_gpu.mean();

  Tensor<std::complex<float>, 0> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(gpu_in_ptr);
  gpu_device.deallocate(gpu_out_ptr);
}

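// Same comparison as above, but for a product reduction.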
static void test_cuda_product_reductions() {

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
  in.setRandom();

  Tensor<std::complex<float>, 0> full_redux;
  full_redux = in.prod();

  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);

  out_gpu.device(gpu_device) = in_gpu.prod();

  Tensor<std::complex<float>, 0> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(gpu_in_ptr);
  gpu_device.deallocate(gpu_out_ptr);
}


EIGEN_DECLARE_TEST(test_cxx11_tensor_complex)
{
  CALL_SUBTEST(test_cuda_nullary());
  CALL_SUBTEST(test_cuda_sum_reductions());
  CALL_SUBTEST(test_cuda_mean_reductions());
  CALL_SUBTEST(test_cuda_product_reductions());
}