// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include <algorithm>
#include <chrono>
#include <ctime>
#include <iostream>

#include "main.h"

#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;

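// All of the tests below follow the same pattern: build random host tensors,
// copy them into buffers obtained from sycl_device.allocate(), evaluate the
// contraction on the SYCL device through TensorMaps, copy the result back with
// memcpyDeviceToHost, and compare it element-wise against the same contraction
// evaluated on the host.
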
template <int DataLayout, typename DataType, typename IndexType,
          typename Device>
void static test_sycl_contraction(const Device &sycl_device, IndexType m_size,
                                  IndexType k_size, IndexType n_size) {
  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
      DimPair;
  static const DataType error_threshold = DataType(1e-4);
  // with these dimensions, the output has 300 * 140 elements, which is
  // more than 30 * 1024, which is the number of threads in blocks on
  // a 15 SM GK110 GPU
  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
  Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
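  // DimPair(1, 0) contracts the second dimension of the left operand with the
  // first dimension of the right one, i.e. an ordinary
  // (m_size x k_size) * (k_size x n_size) matrix product.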
  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
  Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);

  DataType *d_t_left =
      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
  DataType *d_t_right =
      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
  DataType *d_t_result =
      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));

  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_left(d_t_left, left_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_right(d_t_right, right_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_result(d_t_result, result_dims);

  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);

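  // Run the contraction on the SYCL device and copy the result back to the
  // host, then evaluate the same contraction on the host and compare the two
  // results element-wise.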
  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
                                 t_result_bytes);

  t_result = t_left.contract(t_right, dims);

  for (IndexType i = 0; i < t_result.size(); i++) {
    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
            t_result(i) - t_result_gpu(i)))) < error_threshold) {
      continue;
    }
    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
                                  error_threshold)) {
      continue;
    }

    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
              << " vs " << t_result_gpu(i) << std::endl;
    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
  }
  sycl_device.deallocate(d_t_left);
  sycl_device.deallocate(d_t_right);
  sycl_device.deallocate(d_t_result);
}

template <int DataLayout, typename DataType, typename IndexType,
          typename Device>
void test_sycl_contraction_m(const Device &sycl_device) {
  for (IndexType k = 32; k < 256; k++) {
    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128,
                                                           128);
  }
}

template <int DataLayout, typename DataType, typename IndexType,
          typename Device>
void test_sycl_contraction_k(const Device &sycl_device) {
  for (IndexType k = 32; k < 256; k++) {
    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k,
                                                           128);
  }
}

template <int DataLayout, typename DataType, typename IndexType,
          typename Device>
void test_sycl_contraction_n(const Device &sycl_device) {
  for (IndexType k = 32; k < 256; k++) {
    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128,
                                                           128, k);
  }
}

template <int DataLayout, typename DataType, typename IndexType,
          typename Device>
void test_sycl_contraction_sizes(const Device &sycl_device) {
  IndexType m_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
                         257, 511, 512, 513, 1023, 1024, 1025};

  IndexType n_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
                         257, 511, 512, 513, 1023, 1024, 1025};

  IndexType k_sizes[] = {31,  39,  63,  64,  65,  95,   96,   127, 129,
                         255, 257, 511, 512, 513, 1023, 1024, 1025};

  for (IndexType i = 0; i < 15; i++) {
    for (IndexType j = 0; j < 15; j++) {
      for (IndexType k = 0; k < 17; k++) {
        test_sycl_contraction<DataLayout, DataType, IndexType>(
            sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
      }
    }
  }
}

template <int DataLayout, typename DataType, typename IndexType,
          typename Device>
void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size,
                                  IndexType k_size, IndexType n_size) {
  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
      DimPair;
  static const DataType error_threshold = DataType(1e-4);
  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
  Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);

  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
  Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};

  t_left.setRandom();
  t_right.setRandom();

  // Allocate buffers twice as big to check for invalid reads and writes
  auto padded_left_size = 2 * t_left.size();
  auto padded_right_size = 2 * t_right.size();
  auto padded_result_size = 2 * t_result.size();

  std::size_t t_left_bytes = padded_left_size * sizeof(DataType);
  std::size_t t_right_bytes = padded_right_size * sizeof(DataType);
  std::size_t t_result_bytes = padded_result_size * sizeof(DataType);

  DataType *d_t_left =
      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
  DataType *d_t_right =
      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
  DataType *d_t_result =
      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));

  // The TensorMaps are still the same size as the Tensors
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_left(d_t_left, left_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_right(d_t_right, right_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_result(d_t_result, result_dims);

  // Write nan after the actual buffer to propagate nans everywhere in case of
  // invalid reads
  DataType nan = std::numeric_limits<DataType>::quiet_NaN();
  auto host_left_data = new DataType[padded_left_size];
  std::copy_n(t_left.data(), t_left.size(), host_left_data);
  std::fill_n(host_left_data + t_left.size(), t_left.size(), nan);
  auto host_right_data = new DataType[padded_right_size];
  std::copy_n(t_right.data(), t_right.size(), host_right_data);
  std::fill_n(host_right_data + t_right.size(), t_right.size(), nan);
  auto host_result_data = new DataType[padded_result_size];
  std::fill_n(host_result_data, padded_result_size, nan);

  sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes);
  sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes);
  sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes);

  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
  sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes);

  t_result = t_left.contract(t_right, dims);

  for (IndexType i = 0; i < t_result.size(); i++) {
    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
            t_result(i) - host_result_data[i]))) < error_threshold) {
      continue;
    }
    if (Eigen::internal::isApprox(t_result(i), host_result_data[i],
                                  error_threshold)) {
      continue;
    }
    if (std::isnan(host_result_data[i])) {
      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
                << ", invalid read detected at IndexType " << i << ": "
                << t_result(i) << " vs " << host_result_data[i] << std::endl;
    } else {
      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
                << ", mismatch detected at IndexType " << i << ": "
                << t_result(i) << " vs " << host_result_data[i] << std::endl;
    }
    VERIFY_IS_APPROX(host_result_data[i], t_result(i));
  }
  // Make sure that the rest of the result is still nans
  for (IndexType i = t_result.size(); i < padded_result_size; i++) {
    if (std::isnan(host_result_data[i])) {
      continue;
    }
    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
              << ", invalid write detected at IndexType " << i << ": "
              << host_result_data[i] << std::endl;
    VERIFY_IS_APPROX(host_result_data[i], t_result(i));
  }
  sycl_device.deallocate(d_t_left);
  sycl_device.deallocate(d_t_right);
  sycl_device.deallocate(d_t_result);

  delete[] host_left_data;
  delete[] host_right_data;
  delete[] host_result_data;
}

template <int DataLayout, typename DataType, typename IndexType,
          typename Device>
void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size,
                 IndexType n_size) {
  // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size <<
  // ")" << std::endl;
  // with these dimensions, the output has 300 * 140 elements, which is
  // more than 30 * 1024, which is the number of threads in blocks on
  // a 15 SM GK110 GPU
  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
      DimPair;
  static const DataType error_threshold = DataType(1e-4);
  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
  Tensor<DataType, 0, DataLayout, IndexType> t_result;
  Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu;
  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
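  // Contracting over both dimensions yields a rank-0 (scalar) result equal to
  // sum_{i,j} t_left(i, j) * t_right(i, j); this is only well defined when
  // m_size == k_size and k_size == n_size, which is what the callers use.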
  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
  std::size_t t_result_bytes = sizeof(DataType);

  DataType *d_t_left =
      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
  DataType *d_t_right =
      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
  DataType *d_t_result =
      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));

  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_left(d_t_left, left_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_right(d_t_right, right_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType>>
      gpu_t_result(d_t_result);

  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);

  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
                                 t_result_bytes);

  t_result = t_left.contract(t_right, dims);

  if (static_cast<DataType>(std::fabs(static_cast<DataType>(
          t_result() - t_result_gpu()))) > error_threshold &&
      !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
    std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size
              << " : mismatch detected: " << t_result() << " vs "
              << t_result_gpu() << std::endl;
    VERIFY_IS_APPROX(t_result_gpu(), t_result());
  }

  sycl_device.deallocate(d_t_left);
  sycl_device.deallocate(d_t_right);
  sycl_device.deallocate(d_t_result);
}

template <int DataLayout, typename DataType, typename IndexType,
          typename Device>
void contraction_batch(const Device &sycl_device, IndexType m_size,
                       IndexType k_size, IndexType n_size, IndexType m_batch,
                       IndexType start, IndexType limit) {
  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
      DimPair;
  static const DataType error_threshold = DataType(1e-4);
  typedef Eigen::array<IndexType, 3> TensorDim;
  typedef Eigen::Tensor<DataType, 3, DataLayout, IndexType> TensorType;
  TensorDim left_dims = {{m_batch, k_size, m_size}};
  TensorDim right_dims = {{m_batch, n_size, k_size}};
  TensorDim res_dims = {{m_batch, m_size, n_size}};
  Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}};
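  // Each loop iteration below chips off the i-th (k_size x m_size) and
  // (n_size x k_size) slices of the batch; DimPair(0, 1) then contracts their
  // shared k dimension, producing the i-th (m_size x n_size) result slice.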

  TensorType t_left(left_dims);
  TensorType t_right(right_dims);
  TensorType t_result_gpu(res_dims);
  TensorType t_result(res_dims);

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);

  DataType *d_t_left =
      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
  DataType *d_t_right =
      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
  DataType *d_t_result =
      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));

  Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims);
  Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims);
  Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims);

  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
  for (int i = start; i < limit; ++i) {
    auto x = gpu_t_left.template chip<0>(i);
    auto y = gpu_t_right.template chip<0>(i);
    auto z = gpu_t_result.template chip<0>(i);
    z.device(sycl_device) = x.contract(y, contract_pairs);
  }
  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
                                 t_result_bytes);

  for (int i = start; i < limit; ++i) {
    auto x = t_left.template chip<0>(i);
    auto y = t_right.template chip<0>(i);
    auto z = t_result.template chip<0>(i);
    z = x.contract(y, contract_pairs);
  }

  for (IndexType i = 0; i < t_result.size(); i++) {
    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
            t_result(i) - t_result_gpu(i)))) < error_threshold) {
      continue;
    }
    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
                                  error_threshold)) {
      continue;
    }
    std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
              << " vs " << t_result_gpu(i) << std::endl;
    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
  }
  sycl_device.deallocate(d_t_left);
  sycl_device.deallocate(d_t_right);
  sycl_device.deallocate(d_t_result);
}

template <int DataLayout, typename DataType, typename IndexType,
          typename Device>
void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size,
                                IndexType k_size, IndexType n_size) {
  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
      DimPair;
  static const DataType error_threshold = DataType(1e-4);
  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
  Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
  Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}};
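  // The right operand is stored as (n_size x k_size); contracting the second
  // dimension of both operands is equivalent to t_left * t_right^T.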

  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);

  DataType *d_t_left =
      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
  DataType *d_t_right =
      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
  DataType *d_t_result =
      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));

  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_left(d_t_left, left_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_right(d_t_right, right_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_result(d_t_result, res_dims);

  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);

  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
                                 t_result_bytes);

  t_result = t_left.contract(t_right, dims);

  for (IndexType j = 0; j < m_size; j++) {
    for (IndexType i = 0; i < n_size; i++) {
      if (static_cast<DataType>(std::fabs(static_cast<DataType>(
              t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) {
        continue;
      }
      if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i),
                                    error_threshold)) {
        continue;
      }
      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
                << ", mismatch detected at IndexType m: " << j << " n: " << i
                << " CPU : " << t_result(j, i)
                << " vs SYCL:" << t_result_gpu(j, i) << std::endl;
      VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i));
    }
  }
  sycl_device.deallocate(d_t_left);
  sycl_device.deallocate(d_t_right);
  sycl_device.deallocate(d_t_result);
}

template <int DataLayout, typename DataType, typename IndexType,
          typename Device>
void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size,
                                IndexType k_size, IndexType n_size) {
  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
      DimPair;
  static const DataType error_threshold = DataType(1e-4);
  Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
  Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
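  // The left operand is stored as (k_size x m_size); contracting the first
  // dimension of both operands is equivalent to t_left^T * t_right.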

  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);

  DataType *d_t_left =
      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
  DataType *d_t_right =
      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
  DataType *d_t_result =
      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));

  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_left(d_t_left, left_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_right(d_t_right, right_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_result(d_t_result, res_dims);

  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);

  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
                                 t_result_bytes);

  t_result = t_left.contract(t_right, dims);

  for (IndexType i = 0; i < t_result.size(); i++) {
    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
            t_result(i) - t_result_gpu(i)))) < error_threshold) {
      continue;
    }
    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
                                  error_threshold)) {
      continue;
    }
    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
              << " vs " << t_result_gpu(i) << std::endl;
    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
  }
  sycl_device.deallocate(d_t_left);
  sycl_device.deallocate(d_t_right);
  sycl_device.deallocate(d_t_result);
}

template <int DataLayout, typename DataType, typename IndexType,
          typename Device>
void contraction_both_transposed(const Device &sycl_device, IndexType m_size,
                                 IndexType k_size, IndexType n_size) {
  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
      DimPair;
  static const DataType error_threshold = DataType(1e-4);
  Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
  Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
  Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}};
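  // Both operands are stored transposed, (k_size x m_size) and
  // (n_size x k_size); contracting dim 0 of the left with dim 1 of the right
  // is equivalent to t_left^T * t_right^T.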

  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);

  DataType *d_t_left =
      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
  DataType *d_t_right =
      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
  DataType *d_t_result =
      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));

  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_left(d_t_left, left_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_right(d_t_right, right_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
      gpu_t_result(d_t_result, res_dims);

  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);

  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
                                 t_result_bytes);

  t_result = t_left.contract(t_right, dims);

  for (IndexType i = 0; i < t_result.size(); i++) {
    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
            t_result(i) - t_result_gpu(i)))) < error_threshold) {
      continue;
    }
    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
                                  error_threshold)) {
      continue;
    }
    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
              << " vs " << t_result_gpu(i) << std::endl;

    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
  }
  sycl_device.deallocate(d_t_left);
  sycl_device.deallocate(d_t_right);
  sycl_device.deallocate(d_t_result);
}

template <typename Dev>
void inline tensorOutofBound(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  // Test out-of-bounds accesses for Tensor-Tensor contraction
  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
                                                       1024);
  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
                                                       4096);
  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024,
                                                       2048);
  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
                                                       1024);
  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024,
                                                       784);
  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
                                                       10);
  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096,
                                                       513);
  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024,
                                                       783);
  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
                                                       784);
  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024,
                                                       11);
  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "tensor out of bound tests finished computation at "
            << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline tensorTensor(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  // Tensor Tensor Contraction
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128,
                                                       128);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128,
                                                       128);
  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "tensor tensor tests finished computation at "
            << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline tensorTensor_m(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  // Tensor Tensor Contraction
  test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device);
  test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device);

  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "tensor tensor tests finished computation at "
            << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline tensorTensor_n(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  // Tensor Tensor Contraction
  test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device);
  test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device);

  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "tensor tensor tests finished computation at "
            << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline tensorTensor_k(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device);
  test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device);

  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "tensor tensor tests finished computation at "
            << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline tensorTensor_sizes(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  // Tensor Tensor Contraction
  test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device);
  test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device);

  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "tensor tensor tests finished computation at "
            << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}
template <typename Dev>
void inline vectorVector(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  // VECTOR-VECTOR
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1,
                                                       1025);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1,
                                                       1025);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1,
                                                       1024);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1,
                                                       1024);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1,
                                                       1023);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1,
                                                       1023);

  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "contracted tensor tests finished computation at "
            << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline vectorTensor(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  // Vector-Tensor
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025,
                                                       1025);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025,
                                                       1025);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024,
                                                       1024);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024,
                                                       1024);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023,
                                                       1023);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023,
                                                       1023);

  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097,
                                                       4097);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097,
                                                       4097);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096,
                                                       4096);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096,
                                                       4096);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095,
                                                       4095);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095,
                                                       4095);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816,
                                                       32);

  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "finished computation at " << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline tensorVector(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  // Matrix-Vector
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025,
                                                       1);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025,
                                                       1);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024,
                                                       1);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
                                                       1);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023,
                                                       1);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023,
                                                       1);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197,
                                                       1);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097,
                                                       1);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096,
                                                       1);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196,
                                                       1);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095,
                                                       1);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095,
                                                       1);
// If GEMV is disabled, a single kernel is created to compute the whole
// contraction, and the accumulated floating-point error exceeds the precision
// threshold for float, causing the test to fail. With GEMV enabled, multiple
// kernels are created and the accumulation is split among them, which keeps
// the error within the threshold.
#ifndef EIGEN_SYCL_DISABLE_GEMV
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032,
                                                       1);
#endif

  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "finished computation at " << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline tensorScalar(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  // SCALAR Contraction
  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129);

  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "finished computation at " << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline skinnyTensor_row(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  // Tensor Tensor Contraction
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073,
                                                       257);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072,
                                                       256);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073,
                                                       16);
  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072,
                                                       17);
  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "finished computation at " << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline skinnyTensor_col(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  // Tensor Tensor Contraction
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073,
                                                       257);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072,
                                                       256);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073,
                                                       16);
  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072,
                                                       17);
  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "finished computation at " << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline tensor_contraction_batch_per_device(const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();

  contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
                                                   0, 4);
  contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
                                                   0, 4);
  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "finished computation at " << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline tensor_contraction_lhs_transposed_per_device(
    const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();

  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4,
                                                            8);
  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
                                                            32);
  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
                                                            64);
  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784,
                                                            2048, 1024);
  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
                                                            10, 1024);
  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
                                                            1024, 1024);
  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
                                                            4096, 1024);
  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "finished computation at " << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline tensor_contraction_rhs_transposed_per_device(
    const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();

  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4,
                                                            16);
  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
                                                            17);
  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
                                                            32);
  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
                                                            64);
  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10,
                                                            1024, 1024);
  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
                                                            1024, 4096);
  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
                                                            1024, 2048);
  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
                                                            1024, 784);
  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "finished computation at " << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

template <typename Dev>
void inline tensor_contraction_both_transposed_per_device(
    const Dev &sycl_device) {
  typedef float DataType;
  typedef int64_t IndexType;
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();

  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
                                                             17);
  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
                                                             32);
  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64,
                                                             16, 64);
  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "finished computation at " << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";
}

EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) {
  for (const auto &device : Eigen::get_sycl_supported_devices()) {
    std::cout << "Running on "
              << device.template get_info<cl::sycl::info::device::name>()
              << std::endl;
    QueueInterface queueInterface(device);
    auto sycl_device = Eigen::SyclDevice(&queueInterface);
    CALL_SUBTEST_1(tensorOutofBound(sycl_device));
    CALL_SUBTEST_2(tensorTensor(sycl_device));
    CALL_SUBTEST_2(tensorTensor_m(sycl_device));
    CALL_SUBTEST_2(tensorTensor_n(sycl_device));
    CALL_SUBTEST_2(tensorTensor_k(sycl_device));
    CALL_SUBTEST_2(tensorTensor_sizes(sycl_device));
    CALL_SUBTEST_3(vectorVector(sycl_device));
    CALL_SUBTEST_4(vectorTensor(sycl_device));
    CALL_SUBTEST_5(tensorVector(sycl_device));
    CALL_SUBTEST_6(tensorScalar(sycl_device));
    CALL_SUBTEST_7(skinnyTensor_row(sycl_device));
    CALL_SUBTEST_7(skinnyTensor_col(sycl_device));
    CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device));
    CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device));
    CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device));
    CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device));
  }
}