• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016
5 // Mehdi Goli    Codeplay Software Ltd.
6 // Ralph Potter  Codeplay Software Ltd.
7 // Luke Iwanski  Codeplay Software Ltd.
8 // Contact: <eigen@codeplay.com>
9 //
10 // This Source Code Form is subject to the terms of the Mozilla
11 // Public License v. 2.0. If a copy of the MPL was not distributed
12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
13 
14 #define EIGEN_TEST_NO_LONGDOUBLE
15 #define EIGEN_TEST_NO_COMPLEX
16 
17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
18 #define EIGEN_USE_SYCL
19 
20 #include <iostream>
21 #include <chrono>
22 #include <ctime>
23 
24 #include "main.h"
25 #include <unsupported/Eigen/CXX11/Tensor>
26 #include <iomanip>
27 
28 using Eigen::array;
29 using Eigen::SyclDevice;
30 using Eigen::Tensor;
31 using Eigen::TensorMap;
// Maximum relative error tolerated when comparing device vs. host results.
// constexpr: the file already requires C++11 (unsupported/Eigen/CXX11/Tensor),
// so this is a true compile-time constant instead of a runtime-initialized one.
static constexpr float error_threshold = 1e-4f;
33 
34 
35 template <typename DataType, int DataLayout, typename IndexType>
test_larg_expr1D(const Eigen::SyclDevice & sycl_device)36 static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device)
37 {
38   IndexType indim0 =53;
39   IndexType indim1= 55;
40   IndexType indim2= 51;
41   IndexType outdim0=50;
42   IndexType outdim1=55;
43   IndexType outdim2=51;
44   Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
45   Eigen::array<IndexType, 1> kernel_dims = {{4}};
46   Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
47 
48   Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
49   Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
50   Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
51   Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
52 
53   Eigen::array<IndexType, 1> dims3{{0}};
54 
55   input.setRandom();
56   kernel.setRandom();
57   result.setZero();
58   result_host.setZero();
59 
60   std::size_t input_bytes = input.size()  * sizeof(DataType);
61   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
62   std::size_t result_bytes = result.size() * sizeof(DataType);
63 
64   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
65   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
66   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
67 
68   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
69   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
70   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
71   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
72   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
73 
74   gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
75   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
76 
77   result_host=input.convolve(kernel, dims3);
78 
79 for(IndexType i=0; i< outdim0; i++ ){
80   for(IndexType j=0; j< outdim1; j++ ){
81     for(IndexType k=0; k< outdim2; k++ ){
82       if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
83         std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
84         assert(false);
85       }
86     }
87   }
88 }
89   sycl_device.deallocate(d_input);
90   sycl_device.deallocate(d_kernel);
91   sycl_device.deallocate(d_result);
92 
93 }
94 
95 
96 template <typename DataType, int DataLayout, typename IndexType>
test_larg_expr2D(const Eigen::SyclDevice & sycl_device)97 static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device)
98 {
99   IndexType indim0 =53;
100   IndexType indim1= 55;
101   IndexType indim2= 51;
102   IndexType outdim0=50;
103   IndexType outdim1=51;
104   IndexType outdim2=51;
105   Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
106   Eigen::array<IndexType, 2> kernel_dims = {{4,5}};
107   Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
108 
109   Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
110   Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims);
111   Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
112   Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
113 
114   Eigen::array<IndexType, 2> dims3{{0,1}};
115 
116   input.setRandom();
117   kernel.setRandom();
118   result.setZero();
119   result_host.setZero();
120 
121   std::size_t input_bytes = input.size()  * sizeof(DataType);
122   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
123   std::size_t result_bytes = result.size() * sizeof(DataType);
124 
125   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
126   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
127   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
128 
129   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
130   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
131   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
132   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
133   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
134 
135   gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
136   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
137 
138   result_host=input.convolve(kernel, dims3);
139 
140 for(IndexType i=0; i< outdim0; i++ ){
141   for(IndexType j=0; j< outdim1; j++ ){
142     for(IndexType k=0; k< outdim2; k++ ){
143       if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
144         std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
145         assert(false);
146       }
147     }
148   }
149 }
150   sycl_device.deallocate(d_input);
151   sycl_device.deallocate(d_kernel);
152   sycl_device.deallocate(d_result);
153 
154 }
155 
156 
157 template <typename DataType, int DataLayout, typename IndexType>
test_larg_expr3D(const Eigen::SyclDevice & sycl_device)158 static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device)
159 {
160   IndexType indim0 =53;
161   IndexType indim1= 55;
162   IndexType indim2= 51;
163   IndexType outdim0=50;
164   IndexType outdim1=51;
165   IndexType outdim2=49;
166   Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
167   Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}};
168   Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
169 
170   Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
171   Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims);
172   Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
173   Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
174 
175   Eigen::array<IndexType, 3> dims3{{0,1,2}};
176 
177   input.setRandom();
178   kernel.setRandom();
179   result.setZero();
180   result_host.setZero();
181 
182   std::size_t input_bytes = input.size()  * sizeof(DataType);
183   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
184   std::size_t result_bytes = result.size() * sizeof(DataType);
185 
186   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
187   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
188   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
189 
190   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
191   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
192   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
193   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
194   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
195 
196   gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
197   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
198 
199   result_host=input.convolve(kernel, dims3);
200 
201 for(IndexType i=0; i< outdim0; i++ ){
202   for(IndexType j=0; j< outdim1; j++ ){
203     for(IndexType k=0; k< outdim2; k++ ){
204       if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
205         std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
206         assert(false);
207       }
208     }
209   }
210 }
211   sycl_device.deallocate(d_input);
212   sycl_device.deallocate(d_kernel);
213   sycl_device.deallocate(d_result);
214 
215 }
216 
217 
218 template <typename DataType, int DataLayout, typename IndexType>
test_evals(const Eigen::SyclDevice & sycl_device)219 static void test_evals(const Eigen::SyclDevice& sycl_device)
220 {
221   Eigen::array<IndexType, 2> input_dims = {{3, 3}};
222   Eigen::array<IndexType, 1> kernel_dims = {{2}};
223   Eigen::array<IndexType, 2> result_dims = {{2, 3}};
224 
225   Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
226   Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
227   Tensor<DataType, 2, DataLayout,IndexType> result(result_dims);
228 
229   Eigen::array<IndexType, 1> dims3{{0}};
230 
231   input.setRandom();
232   kernel.setRandom();
233   result.setZero();
234 
235   std::size_t input_bytes = input.size()  * sizeof(DataType);
236   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
237   std::size_t result_bytes = result.size() * sizeof(DataType);
238 
239   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
240   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
241   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
242 
243   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims);
244   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
245   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims);
246   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
247   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
248 
249   gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
250   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
251 
252   VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1));  // index 0
253   VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1));  // index 2
254   VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1));  // index 4
255   VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1));  // index 1
256   VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1));  // index 3
257   VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1));  // index 5
258 
259   sycl_device.deallocate(d_input);
260   sycl_device.deallocate(d_kernel);
261   sycl_device.deallocate(d_result);
262 }
263 
264 template <typename DataType, int DataLayout, typename IndexType>
test_expr(const Eigen::SyclDevice & sycl_device)265 static void test_expr(const Eigen::SyclDevice& sycl_device)
266 {
267   Eigen::array<IndexType, 2> input_dims = {{3, 3}};
268   Eigen::array<IndexType, 2> kernel_dims = {{2, 2}};
269   Eigen::array<IndexType, 2> result_dims = {{2, 2}};
270 
271   Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
272   Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims);
273   Tensor<DataType, 2, DataLayout, IndexType> result(result_dims);
274 
275   input.setRandom();
276   kernel.setRandom();
277   Eigen::array<IndexType, 2> dims;
278   dims[0] = 0;
279   dims[1] = 1;
280 
281   std::size_t input_bytes = input.size()  * sizeof(DataType);
282   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
283   std::size_t result_bytes = result.size() * sizeof(DataType);
284 
285   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
286   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
287   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
288 
289   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims);
290   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
291   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims);
292   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
293   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
294 
295   gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
296   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
297 
298   VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
299                                 input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
300   VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
301                                 input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
302   VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
303                                 input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
304   VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
305                                 input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
306 
307   sycl_device.deallocate(d_input);
308   sycl_device.deallocate(d_kernel);
309   sycl_device.deallocate(d_result);
310 }
311 
312 
313 template <typename DataType, int DataLayout, typename IndexType>
test_modes(const Eigen::SyclDevice & sycl_device)314 static void test_modes(const Eigen::SyclDevice& sycl_device){
315 
316 Eigen::array<IndexType, 1> input_dims = {{3}};
317 Eigen::array<IndexType, 1> kernel_dims = {{3}};
318 
319 Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
320 Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
321 
322 input.setRandom();
323 kernel.setRandom();
324 Eigen::array<IndexType, 1> dims;
325 dims[0] = 0;
326 
327   input(0) = 1.0f;
328   input(1) = 2.0f;
329   input(2) = 3.0f;
330   kernel(0) = 0.5f;
331   kernel(1) = 1.0f;
332   kernel(2) = 0.0f;
333 
334   Eigen::array<std::pair<IndexType, IndexType>, 1> padding;
335 
336   // Emulate VALID mode (as defined in
337   // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
338   padding[0] = std::make_pair(0, 0);
339   Tensor<DataType, 1, DataLayout, IndexType> valid(1);
340 
341   std::size_t input_bytes = input.size()  * sizeof(DataType);
342   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
343   std::size_t valid_bytes = valid.size() * sizeof(DataType);
344 
345   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
346   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
347   DataType * d_valid =  static_cast<DataType*>(sycl_device.allocate(valid_bytes));
348 
349   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
350   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
351   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions());
352   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
353   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
354 
355   gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
356   sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes);
357 
358   VERIFY_IS_EQUAL(valid.dimension(0), 1);
359   VERIFY_IS_APPROX(valid(0), 2.5f);
360 
361   // Emulate SAME mode (as defined in
362   // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
363   padding[0] = std::make_pair(1, 1);
364   Tensor<DataType, 1, DataLayout, IndexType> same(3);
365   std::size_t same_bytes = same.size() * sizeof(DataType);
366   DataType * d_same =  static_cast<DataType*>(sycl_device.allocate(same_bytes));
367   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions());
368   gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
369   sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes);
370 
371   VERIFY_IS_EQUAL(same.dimension(0), 3);
372   VERIFY_IS_APPROX(same(0), 1.0f);
373   VERIFY_IS_APPROX(same(1), 2.5f);
374   VERIFY_IS_APPROX(same(2), 4.0f);
375 
376   // Emulate FULL mode (as defined in
377   // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
378   padding[0] = std::make_pair(2, 2);
379 
380   Tensor<DataType, 1, DataLayout, IndexType> full(5);
381   std::size_t full_bytes = full.size() * sizeof(DataType);
382   DataType * d_full =  static_cast<DataType*>(sycl_device.allocate(full_bytes));
383   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions());
384   gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
385   sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes);
386 
387   VERIFY_IS_EQUAL(full.dimension(0), 5);
388   VERIFY_IS_APPROX(full(0), 0.0f);
389   VERIFY_IS_APPROX(full(1), 1.0f);
390   VERIFY_IS_APPROX(full(2), 2.5f);
391   VERIFY_IS_APPROX(full(3), 4.0f);
392   VERIFY_IS_APPROX(full(4), 1.5f);
393 
394   sycl_device.deallocate(d_input);
395   sycl_device.deallocate(d_kernel);
396   sycl_device.deallocate(d_valid);
397   sycl_device.deallocate(d_same);
398   sycl_device.deallocate(d_full);
399 
400 }
401 
402 template <typename DataType, int DataLayout, typename IndexType>
test_strides(const Eigen::SyclDevice & sycl_device)403 static void test_strides(const Eigen::SyclDevice& sycl_device){
404 
405   Eigen::array<IndexType, 1> input_dims = {{13}};
406   Eigen::array<IndexType, 1> kernel_dims = {{3}};
407 
408   Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
409   Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
410   Tensor<DataType, 1, DataLayout, IndexType> result(2);
411 
412   input.setRandom();
413   kernel.setRandom();
414   Eigen::array<IndexType, 1> dims;
415   dims[0] = 0;
416 
417   Eigen::array<IndexType, 1> stride_of_3;
418   stride_of_3[0] = 3;
419   Eigen::array<IndexType, 1> stride_of_2;
420   stride_of_2[0] = 2;
421 
422   std::size_t input_bytes = input.size()  * sizeof(DataType);
423   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
424   std::size_t result_bytes = result.size() * sizeof(DataType);
425 
426   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
427   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
428   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
429 
430   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
431   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
432   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions());
433   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
434   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
435 
436   gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
437   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
438 
439   VERIFY_IS_EQUAL(result.dimension(0), 2);
440   VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
441                                input(6)*kernel(2)));
442   VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
443                                input(12)*kernel(2)));
444 }
445 
tensorConvolutionPerDevice(Dev_selector & s)446 template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){
447   QueueInterface queueInterface(s);
448   auto sycl_device=Eigen::SyclDevice(&queueInterface);
449   test_larg_expr1D<float, RowMajor, int64_t>(sycl_device);
450   test_larg_expr1D<float, ColMajor, int64_t>(sycl_device);
451   test_larg_expr2D<float, RowMajor, int64_t>(sycl_device);
452   test_larg_expr2D<float, ColMajor, int64_t>(sycl_device);
453   test_larg_expr3D<float, RowMajor, int64_t>(sycl_device);
454   test_larg_expr3D<float, ColMajor, int64_t>(sycl_device);
455   test_evals<float, ColMajor, int64_t>(sycl_device);
456   test_evals<float, RowMajor, int64_t>(sycl_device);
457   test_expr<float, ColMajor, int64_t>(sycl_device);
458   test_expr<float, RowMajor, int64_t>(sycl_device);
459   test_modes<float, ColMajor, int64_t>(sycl_device);
460   test_modes<float, RowMajor, int64_t>(sycl_device);
461   test_strides<float, ColMajor, int64_t>(sycl_device);
462   test_strides<float, RowMajor, int64_t>(sycl_device);
463 }
464 
// Test entry point: exercise every SYCL device supported at runtime.
EIGEN_DECLARE_TEST(cxx11_tensor_convolution_sycl) {
  for (const auto& dev : Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(tensorConvolutionPerDevice(dev));
  }
}
470