// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_device
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;
using Eigen::RowMajor;
// Context for evaluation on the CPU
struct CPUContext {
  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
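    // Fill the convolution kernels with fixed coefficients so the expected
    // values in the checks below can be computed by hand.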
    kernel_1d_(0) = 3.14f;
    kernel_1d_(1) = 2.7f;

    kernel_2d_(0,0) = 3.14f;
    kernel_2d_(1,0) = 2.7f;
    kernel_2d_(0,1) = 0.2f;
    kernel_2d_(1,1) = 7.0f;

    kernel_3d_(0,0,0) = 3.14f;
    kernel_3d_(0,1,0) = 2.7f;
    kernel_3d_(0,0,1) = 0.2f;
    kernel_3d_(0,1,1) = 7.0f;
    kernel_3d_(1,0,0) = -1.0f;
    kernel_3d_(1,1,0) = -0.3f;
    kernel_3d_(1,0,1) = -0.7f;
    kernel_3d_(1,1,1) = -0.5f;
  }

  const Eigen::DefaultDevice& device() const { return cpu_device_; }

  const Eigen::Tensor<float, 3>& in1() const { return in1_; }
  const Eigen::Tensor<float, 3>& in2() const { return in2_; }
  Eigen::Tensor<float, 3>& out() { return out_; }
  const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
  const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
  const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }

 private:
  const Eigen::Tensor<float, 3>& in1_;
  const Eigen::Tensor<float, 3>& in2_;
  Eigen::Tensor<float, 3>& out_;

  Eigen::Tensor<float, 1> kernel_1d_;
  Eigen::Tensor<float, 2> kernel_2d_;
  Eigen::Tensor<float, 3> kernel_3d_;

  Eigen::DefaultDevice cpu_device_;
};


// Context for evaluation on the GPU
struct GPUContext {
  GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
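    // Allocate the convolution kernels on the device and upload their
    // coefficients. The 3d kernel is laid out in column-major order, matching
    // the CPU kernel above. Note that the cuda calls live inside assert(), so
    // this test relies on assertions being enabled.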
    assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);
    float kernel_1d_val[] = {3.14f, 2.7f};
    assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);

    assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
    float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
    assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);

    assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
    float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
    assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
  }
  ~GPUContext() {
    assert(cudaFree(kernel_1d_) == cudaSuccess);
    assert(cudaFree(kernel_2d_) == cudaSuccess);
    assert(cudaFree(kernel_3d_) == cudaSuccess);
  }

  const Eigen::GpuDevice& device() const { return gpu_device_; }

  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
  Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
  Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }

 private:
  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;

  float* kernel_1d_;
  float* kernel_2d_;
  float* kernel_3d_;

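  // The GpuDevice evaluates expressions on the CUDA stream it wraps.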
  Eigen::CudaStreamDevice stream_;
  Eigen::GpuDevice gpu_device_;
};


// The actual expressions to evaluate, templated on the execution context
template <typename Context>
void test_contextual_eval(Context* context)
{
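  // A coefficient-wise expression, evaluated directly into the output on the
  // context's device.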
  context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
}

template <typename Context>
void test_forced_contextual_eval(Context* context)
{
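  // Same expression, but .eval() forces the sum into a temporary before the
  // rest of the computation proceeds.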
  context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
}

template <typename Context>
void test_compound_assignment(Context* context)
{
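  // The += evaluation reads the previous contents of the output back in.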
  context->out().device(context->device()) = context->in1().constant(2.718f);
  context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
}


template <typename Context>
void test_contraction(Context* context)
{
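  // Contract dimensions 1 and 2 of both inputs, yielding a 40x40 result that
  // is written into the top-left corner of the (reshaped) output.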
  Eigen::array<std::pair<int, int>, 2> dims;
  dims[0] = std::make_pair(1, 1);
  dims[1] = std::make_pair(2, 2);

  Eigen::array<int, 2> shape(40, 50*70);

  Eigen::DSizes<int, 2> indices(0,0);
  Eigen::DSizes<int, 2> sizes(40,40);

  context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
}


template <typename Context>
void test_1d_convolution(Context* context)
{
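  // Convolving along dimension 1 shrinks it from 50 to 49, so only a
  // 40x49x70 slice of the output is written.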
  Eigen::DSizes<int, 3> indices(0,0,0);
  Eigen::DSizes<int, 3> sizes(40,49,70);

  Eigen::array<int, 1> dims(1);
  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
}

template <typename Context>
void test_2d_convolution(Context* context)
{
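  // Convolving along dimensions 1 and 2 yields a 40x49x69 result.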
  Eigen::DSizes<int, 3> indices(0,0,0);
  Eigen::DSizes<int, 3> sizes(40,49,69);

  Eigen::array<int, 2> dims(1,2);
  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
}

template <typename Context>
void test_3d_convolution(Context* context)
{
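  // Convolving along all three dimensions yields a 39x49x69 result.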
  Eigen::DSizes<int, 3> indices(0,0,0);
  Eigen::DSizes<int, 3> sizes(39,49,69);

  Eigen::array<int, 3> dims(0,1,2);
  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
}


void test_cpu() {
  Eigen::Tensor<float, 3> in1(40,50,70);
  Eigen::Tensor<float, 3> in2(40,50,70);
  Eigen::Tensor<float, 3> out(40,50,70);

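  // Shift the random inputs away from zero so the relative-error comparisons
  // below stay well conditioned.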
  in1 = in1.random() + in1.constant(10.0f);
  in2 = in2.random() + in2.constant(10.0f);

  CPUContext context(in1, in2, out);
  test_contextual_eval(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
      }
    }
  }

  test_forced_contextual_eval(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
      }
    }
  }

  test_compound_assignment(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
      }
    }
  }

  test_contraction(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 40; ++j) {
      const float result = out(i,j,0);
      float expected = 0;
      for (int k = 0; k < 50; ++k) {
        for (int l = 0; l < 70; ++l) {
          expected += in1(i, k, l) * in2(j, k, l);
        }
      }
      VERIFY_IS_APPROX(expected, result);
    }
  }

  test_1d_convolution(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
      }
    }
  }

  test_2d_convolution(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 69; ++k) {
        const float result = out(i,j,k);
        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
                               (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
          continue;
        }
        VERIFY_IS_APPROX(expected, result);
      }
    }
  }

  test_3d_convolution(&context);
  for (int i = 0; i < 39; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 69; ++k) {
        const float result = out(i,j,k);
        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
                               (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
          continue;
        }
        VERIFY_IS_APPROX(expected, result);
      }
    }
  }
}

void test_gpu() {
  Eigen::Tensor<float, 3> in1(40,50,70);
  Eigen::Tensor<float, 3> in2(40,50,70);
  Eigen::Tensor<float, 3> out(40,50,70);
  in1 = in1.random() + in1.constant(10.0f);
  in2 = in2.random() + in2.constant(10.0f);

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

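  // Allocate device buffers and upload the inputs.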
  float* d_in1;
  float* d_in2;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_in2), in2_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);

  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);

  GPUContext context(gpu_in1, gpu_in2, gpu_out);
  test_contextual_eval(&context);
  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
      }
    }
  }

  test_forced_contextual_eval(&context);
  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
      }
    }
  }

  test_compound_assignment(&context);
  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
      }
    }
  }

  test_contraction(&context);
  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 40; ++j) {
      const float result = out(i,j,0);
      float expected = 0;
      for (int k = 0; k < 50; ++k) {
        for (int l = 0; l < 70; ++l) {
          expected += in1(i, k, l) * in2(j, k, l);
        }
      }
      VERIFY_IS_APPROX(expected, result);
    }
  }

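  // The convolutions are launched asynchronously on the device stream, so the
  // results are copied back with cudaMemcpyAsync on that stream and the
  // stream is synchronized before checking.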
  test_1d_convolution(&context);
  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
      }
    }
  }

  test_2d_convolution(&context);
  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 69; ++k) {
        const float result = out(i,j,k);
        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
        VERIFY_IS_APPROX(expected, result);
      }
    }
  }

  test_3d_convolution(&context);
  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
  for (int i = 0; i < 39; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 69; ++k) {
        const float result = out(i,j,k);
        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
                                in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
        VERIFY_IS_APPROX(expected, result);
      }
    }
  }

  // Release the device buffers.
  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out);
}


void test_cxx11_tensor_device()
{
  CALL_SUBTEST_1(test_cpu());
  CALL_SUBTEST_2(test_gpu());
}