1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #define EIGEN_TEST_NO_LONGDOUBLE
11 #define EIGEN_TEST_NO_COMPLEX
12 
13 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
14 #define EIGEN_USE_GPU
15 
16 #include "main.h"
17 #include <unsupported/Eigen/CXX11/Tensor>
18 
19 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
20 
21 using Eigen::Tensor;
22 using Eigen::RowMajor;
23 
24 // Context for evaluation on cpu
25 struct CPUContext {
CPUContextCPUContext26   CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
27     kernel_1d_(0) = 3.14f;
28     kernel_1d_(1) = 2.7f;
29 
30     kernel_2d_(0,0) = 3.14f;
31     kernel_2d_(1,0) = 2.7f;
32     kernel_2d_(0,1) = 0.2f;
33     kernel_2d_(1,1) = 7.0f;
34 
35     kernel_3d_(0,0,0) = 3.14f;
36     kernel_3d_(0,1,0) = 2.7f;
37     kernel_3d_(0,0,1) = 0.2f;
38     kernel_3d_(0,1,1) = 7.0f;
39     kernel_3d_(1,0,0) = -1.0f;
40     kernel_3d_(1,1,0) = -0.3f;
41     kernel_3d_(1,0,1) = -0.7f;
42     kernel_3d_(1,1,1) = -0.5f;
43   }
44 
deviceCPUContext45   const Eigen::DefaultDevice& device() const { return cpu_device_; }
46 
in1CPUContext47   const Eigen::Tensor<float, 3>& in1() const { return in1_; }
in2CPUContext48   const Eigen::Tensor<float, 3>& in2() const { return in2_; }
outCPUContext49   Eigen::Tensor<float, 3>& out() { return out_; }
kernel1dCPUContext50   const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
kernel2dCPUContext51   const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
kernel3dCPUContext52   const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
53 
54  private:
55   const Eigen::Tensor<float, 3>& in1_;
56   const Eigen::Tensor<float, 3>& in2_;
57   Eigen::Tensor<float, 3>& out_;
58 
59   Eigen::Tensor<float, 1> kernel_1d_;
60   Eigen::Tensor<float, 2> kernel_2d_;
61   Eigen::Tensor<float, 3> kernel_3d_;
62 
63   Eigen::DefaultDevice cpu_device_;
64 };
65 
66 
67 // Context for evaluation on GPU
68 struct GPUContext {
GPUContextGPUContext69   GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
70     assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess);
71     float kernel_1d_val[] = {3.14f, 2.7f};
72     assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
73 
74     assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess);
75     float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
76     assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
77 
78     assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess);
79     float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
80     assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
81   }
~GPUContextGPUContext82   ~GPUContext() {
83     assert(gpuFree(kernel_1d_) == gpuSuccess);
84     assert(gpuFree(kernel_2d_) == gpuSuccess);
85     assert(gpuFree(kernel_3d_) == gpuSuccess);
86   }
87 
deviceGPUContext88   const Eigen::GpuDevice& device() const { return gpu_device_; }
89 
in1GPUContext90   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
in2GPUContext91   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
outGPUContext92   Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
kernel1dGPUContext93   Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
kernel2dGPUContext94   Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
kernel3dGPUContext95   Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
96 
97  private:
98   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
99   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
100   Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
101 
102   float* kernel_1d_;
103   float* kernel_2d_;
104   float* kernel_3d_;
105 
106   Eigen::GpuStreamDevice stream_;
107   Eigen::GpuDevice gpu_device_;
108 };
109 
110 
// The actual expression to evaluate.
//
// Writes in1 + in2 * 3.14 + 2.718 into the context's output, evaluated on the
// device supplied by the context.
template <typename Context>
void test_contextual_eval(Context* context)
{
  Context& ctx = *context;
  ctx.out().device(ctx.device()) =
      ctx.in1() + ctx.in2() * 3.14f + ctx.in1().constant(2.718f);
}
117 
// Same shape of expression as test_contextual_eval, except that the sum
// in1 + in2 is wrapped in .eval() before being scaled by 3.14 and offset by
// 2.718.
template <typename Context>
void test_forced_contextual_eval(Context* context)
{
  Context& ctx = *context;
  ctx.out().device(ctx.device()) =
      (ctx.in1() + ctx.in2()).eval() * 3.14f + ctx.in1().constant(2.718f);
}
123 
// Builds the same value as test_contextual_eval in two steps: the output is
// first set to the constant 2.718, then in1 + in2 * 3.14 is accumulated into
// it with +=. Both statements run on the context's device.
template <typename Context>
void test_compound_assignment(Context* context)
{
  Context& ctx = *context;

  // Step 1: plain assignment of the constant term.
  ctx.out().device(ctx.device()) = ctx.in1().constant(2.718f);

  // Step 2: compound assignment of the remaining terms.
  ctx.out().device(ctx.device()) += ctx.in1() + ctx.in2() * 3.14f;
}
130 
131 
132 template <typename Context>
test_contraction(Context * context)133 void test_contraction(Context* context)
134 {
135   Eigen::array<std::pair<int, int>, 2> dims;
136   dims[0] = std::make_pair(1, 1);
137   dims[1] = std::make_pair(2, 2);
138 
139   Eigen::array<int, 2> shape(40, 50*70);
140 
141   Eigen::DSizes<int, 2> indices(0,0);
142   Eigen::DSizes<int, 2> sizes(40,40);
143 
144   context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
145 }
146 
147 
148 template <typename Context>
test_1d_convolution(Context * context)149 void test_1d_convolution(Context* context)
150 {
151   Eigen::DSizes<int, 3> indices(0,0,0);
152   Eigen::DSizes<int, 3> sizes(40,49,70);
153 
154   Eigen::array<int, 1> dims(1);
155   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
156 }
157 
158 template <typename Context>
test_2d_convolution(Context * context)159 void test_2d_convolution(Context* context)
160 {
161   Eigen::DSizes<int, 3> indices(0,0,0);
162   Eigen::DSizes<int, 3> sizes(40,49,69);
163 
164   Eigen::array<int, 2> dims(1,2);
165   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
166 }
167 
168 template <typename Context>
test_3d_convolution(Context * context)169 void test_3d_convolution(Context* context)
170 {
171   Eigen::DSizes<int, 3> indices(0,0,0);
172   Eigen::DSizes<int, 3> sizes(39,49,69);
173 
174   Eigen::array<int, 3> dims(0,1,2);
175   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
176 }
177 
178 
test_cpu()179 void test_cpu() {
180   Eigen::Tensor<float, 3> in1(40,50,70);
181   Eigen::Tensor<float, 3> in2(40,50,70);
182   Eigen::Tensor<float, 3> out(40,50,70);
183 
184   in1 = in1.random() + in1.constant(10.0f);
185   in2 = in2.random() + in2.constant(10.0f);
186 
187   CPUContext context(in1, in2, out);
188   test_contextual_eval(&context);
189   for (int i = 0; i < 40; ++i) {
190     for (int j = 0; j < 50; ++j) {
191       for (int k = 0; k < 70; ++k) {
192         VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
193       }
194     }
195   }
196 
197   test_forced_contextual_eval(&context);
198   for (int i = 0; i < 40; ++i) {
199     for (int j = 0; j < 50; ++j) {
200       for (int k = 0; k < 70; ++k) {
201         VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
202       }
203     }
204   }
205 
206   test_compound_assignment(&context);
207   for (int i = 0; i < 40; ++i) {
208     for (int j = 0; j < 50; ++j) {
209       for (int k = 0; k < 70; ++k) {
210         VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
211       }
212     }
213   }
214 
215   test_contraction(&context);
216   for (int i = 0; i < 40; ++i) {
217     for (int j = 0; j < 40; ++j) {
218       const float result = out(i,j,0);
219       float expected = 0;
220       for (int k = 0; k < 50; ++k) {
221         for (int l = 0; l < 70; ++l) {
222           expected += in1(i, k, l) * in2(j, k, l);
223         }
224       }
225       VERIFY_IS_APPROX(expected, result);
226     }
227   }
228 
229   test_1d_convolution(&context);
230   for (int i = 0; i < 40; ++i) {
231     for (int j = 0; j < 49; ++j) {
232       for (int k = 0; k < 70; ++k) {
233         VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
234       }
235     }
236   }
237 
238   test_2d_convolution(&context);
239   for (int i = 0; i < 40; ++i) {
240     for (int j = 0; j < 49; ++j) {
241       for (int k = 0; k < 69; ++k) {
242         const float result = out(i,j,k);
243         const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
244                                (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
245         if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
246           continue;
247         }
248         VERIFY_IS_APPROX(expected, result);
249       }
250     }
251   }
252 
253   test_3d_convolution(&context);
254   for (int i = 0; i < 39; ++i) {
255     for (int j = 0; j < 49; ++j) {
256       for (int k = 0; k < 69; ++k) {
257         const float result = out(i,j,k);
258         const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
259                                 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
260                                (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
261                                 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
262         if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
263           continue;
264         }
265         VERIFY_IS_APPROX(expected, result);
266       }
267     }
268   }
269 }
270 
test_gpu()271 void test_gpu() {
272   Eigen::Tensor<float, 3> in1(40,50,70);
273   Eigen::Tensor<float, 3> in2(40,50,70);
274   Eigen::Tensor<float, 3> out(40,50,70);
275   in1 = in1.random() + in1.constant(10.0f);
276   in2 = in2.random() + in2.constant(10.0f);
277 
278   std::size_t in1_bytes = in1.size() * sizeof(float);
279   std::size_t in2_bytes = in2.size() * sizeof(float);
280   std::size_t out_bytes = out.size() * sizeof(float);
281 
282   float* d_in1;
283   float* d_in2;
284   float* d_out;
285   gpuMalloc((void**)(&d_in1), in1_bytes);
286   gpuMalloc((void**)(&d_in2), in2_bytes);
287   gpuMalloc((void**)(&d_out), out_bytes);
288 
289   gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
290   gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
291 
292   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
293   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
294   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
295 
296   GPUContext context(gpu_in1, gpu_in2, gpu_out);
297   test_contextual_eval(&context);
298   assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
299   for (int i = 0; i < 40; ++i) {
300     for (int j = 0; j < 50; ++j) {
301       for (int k = 0; k < 70; ++k) {
302         VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
303       }
304     }
305   }
306 
307   test_forced_contextual_eval(&context);
308   assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
309   for (int i = 0; i < 40; ++i) {
310     for (int j = 0; j < 50; ++j) {
311       for (int k = 0; k < 70; ++k) {
312         VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
313       }
314     }
315   }
316 
317   test_compound_assignment(&context);
318   assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
319   for (int i = 0; i < 40; ++i) {
320     for (int j = 0; j < 50; ++j) {
321       for (int k = 0; k < 70; ++k) {
322         VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
323       }
324     }
325   }
326 
327   test_contraction(&context);
328   assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
329   for (int i = 0; i < 40; ++i) {
330     for (int j = 0; j < 40; ++j) {
331       const float result = out(i,j,0);
332       float expected = 0;
333       for (int k = 0; k < 50; ++k) {
334         for (int l = 0; l < 70; ++l) {
335           expected += in1(i, k, l) * in2(j, k, l);
336         }
337       }
338       VERIFY_IS_APPROX(expected, result);
339     }
340   }
341 
342   test_1d_convolution(&context);
343   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
344   assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
345   for (int i = 0; i < 40; ++i) {
346     for (int j = 0; j < 49; ++j) {
347       for (int k = 0; k < 70; ++k) {
348         VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
349       }
350     }
351   }
352 
353   test_2d_convolution(&context);
354   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
355   assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
356   for (int i = 0; i < 40; ++i) {
357     for (int j = 0; j < 49; ++j) {
358       for (int k = 0; k < 69; ++k) {
359         const float result = out(i,j,k);
360         const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
361                                 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
362         VERIFY_IS_APPROX(expected, result);
363       }
364     }
365   }
366 
367 #if !defined(EIGEN_USE_HIP)
368 // disable this test on the HIP platform
369 // 3D tensor convolutions seem to hang on the HIP platform
370 
371   test_3d_convolution(&context);
372   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
373   assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
374   for (int i = 0; i < 39; ++i) {
375     for (int j = 0; j < 49; ++j) {
376       for (int k = 0; k < 69; ++k) {
377        const float result = out(i,j,k);
378         const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
379                                 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
380                                 in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
381                                 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
382         VERIFY_IS_APPROX(expected, result);
383       }
384     }
385   }
386 
387 #endif
388 
389 }
390 
391 
// Test entry point: the CPU variant is registered as subtest 1 and the GPU
// variant as subtest 2.
EIGEN_DECLARE_TEST(cxx11_tensor_device) {
  CALL_SUBTEST_1(test_cpu());
  CALL_SUBTEST_2(test_gpu());
}
397