// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE

#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;
// Checks two basic complex-tensor expressions on the GPU:
//  - a nullary (constant) assignment into a complex<float> tensor, and
//  - an elementwise abs() from a complex<float> tensor into a float tensor.
// The device results are copied back to the host and compared against the
// expected values.
void test_cuda_nullary() {
  Tensor<std::complex<float>, 1, 0, int> in1(2);
  Tensor<std::complex<float>, 1, 0, int> in2(2);
  in1.setRandom();
  in2.setRandom();

  std::size_t float_bytes = in1.size() * sizeof(float);
  std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>);

  std::complex<float>* d_in1;
  std::complex<float>* d_in2;
  float* d_out2;
  cudaMalloc((void**)(&d_in1), complex_bytes);
  cudaMalloc((void**)(&d_in2), complex_bytes);
  cudaMalloc((void**)(&d_out2), float_bytes);
  cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
      d_in1, 2);
  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2(
      d_in2, 2);
  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2(
      d_out2, 2);

  // Overwrite in1 on the device with a constant, and compute |in2| into out2.
  gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f));
  gpu_out2.device(gpu_device) = gpu_in2.abs();

  Tensor<std::complex<float>, 1, 0, int> new1(2);
  Tensor<float, 1, 0, int> new2(2);

  // BUGFIX: the copies and the synchronize must not live inside assert():
  // with NDEBUG defined, assert() compiles away its argument, so the device
  // results would never reach the host and the stream would never be
  // synchronized before reading new1/new2. Run the calls unconditionally and
  // check the status codes instead (VERIFY always evaluates its argument).
  cudaError_t copy1 = cudaMemcpyAsync(new1.data(), d_in1, complex_bytes,
                                      cudaMemcpyDeviceToHost, gpu_device.stream());
  VERIFY(copy1 == cudaSuccess);
  cudaError_t copy2 = cudaMemcpyAsync(new2.data(), d_out2, float_bytes,
                                      cudaMemcpyDeviceToHost, gpu_device.stream());
  VERIFY(copy2 == cudaSuccess);

  cudaError_t sync = cudaStreamSynchronize(gpu_device.stream());
  VERIFY(sync == cudaSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f));
    VERIFY_IS_APPROX(new2(i), std::abs(in2(i)));
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out2);
}
69 
70 
// Verify that a full sum() reduction of a complex<float> 2D tensor computed
// on the GPU matches the same reduction computed on the host.
static void test_cuda_sum_reductions() {

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  // Random sizes; rows are drawn before cols to keep the RNG sequence stable.
  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
  in.setRandom();

  // Host-side reference result.
  Tensor<std::complex<float>, 0> full_redux = in.sum();

  const std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
  const std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
  std::complex<float>* d_in =
      static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
  std::complex<float>* d_out =
      static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(d_in, in.data(), in_bytes);

  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(d_in, num_rows, num_cols);
  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(d_out);

  // Same reduction, evaluated on the device.
  out_gpu.device(gpu_device) = in_gpu.sum();

  Tensor<std::complex<float>, 0> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), d_out, out_bytes);
  gpu_device.synchronize();

  // The CPU and GPU reductions must agree (approximately, for floats).
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(d_in);
  gpu_device.deallocate(d_out);
}
106 
// Verify that a full mean() reduction of a complex<float> 2D tensor computed
// on the GPU matches the same reduction computed on the host.
static void test_cuda_mean_reductions() {

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  // Random sizes; rows are drawn before cols to keep the RNG sequence stable.
  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
  in.setRandom();

  // Host-side reference result.
  Tensor<std::complex<float>, 0> full_redux = in.mean();

  const std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
  const std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
  std::complex<float>* d_in =
      static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
  std::complex<float>* d_out =
      static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(d_in, in.data(), in_bytes);

  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(d_in, num_rows, num_cols);
  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(d_out);

  // Same reduction, evaluated on the device.
  out_gpu.device(gpu_device) = in_gpu.mean();

  Tensor<std::complex<float>, 0> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), d_out, out_bytes);
  gpu_device.synchronize();

  // The CPU and GPU reductions must agree (approximately, for floats).
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(d_in);
  gpu_device.deallocate(d_out);
}
142 
// Verify that a full prod() reduction of a complex<float> 2D tensor computed
// on the GPU matches the same reduction computed on the host.
static void test_cuda_product_reductions() {

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  // Random sizes; rows are drawn before cols to keep the RNG sequence stable.
  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
  in.setRandom();

  // Host-side reference result.
  Tensor<std::complex<float>, 0> full_redux = in.prod();

  const std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
  const std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
  std::complex<float>* d_in =
      static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
  std::complex<float>* d_out =
      static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(d_in, in.data(), in_bytes);

  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(d_in, num_rows, num_cols);
  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(d_out);

  // Same reduction, evaluated on the device.
  out_gpu.device(gpu_device) = in_gpu.prod();

  Tensor<std::complex<float>, 0> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), d_out, out_bytes);
  gpu_device.synchronize();

  // The CPU and GPU reductions must agree (approximately, for floats).
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(d_in);
  gpu_device.deallocate(d_out);
}
178 
179 
// Test entry point: run every GPU complex-tensor subtest.
// (Restored from a scraper-mangled declaration line that broke compilation.)
EIGEN_DECLARE_TEST(test_cxx11_tensor_complex)
{
  CALL_SUBTEST(test_cuda_nullary());
  CALL_SUBTEST(test_cuda_sum_reductions());
  CALL_SUBTEST(test_cuda_mean_reductions());
  CALL_SUBTEST(test_cuda_product_reductions());
}