1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016
5 // Mehdi Goli Codeplay Software Ltd.
6 // Ralph Potter Codeplay Software Ltd.
7 // Luke Iwanski Codeplay Software Ltd.
8 // Contact: <eigen@codeplay.com>
9 //
10 // This Source Code Form is subject to the terms of the Mozilla
11 // Public License v. 2.0. If a copy of the MPL was not distributed
12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
13
14 #define EIGEN_TEST_NO_LONGDOUBLE
15 #define EIGEN_TEST_NO_COMPLEX
16
17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
18 #define EIGEN_USE_SYCL
19
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <ctime>
#include <iostream>
#include <limits>
#include <vector>
24
25 #include "main.h"
26
27 #include <unsupported/Eigen/CXX11/Tensor>
28
29 using Eigen::array;
30 using Eigen::SyclDevice;
31 using Eigen::Tensor;
32 using Eigen::TensorMap;
33
34 template <int DataLayout, typename DataType, typename IndexType,
35 typename Device>
test_sycl_contraction(const Device & sycl_device,IndexType m_size,IndexType k_size,IndexType n_size)36 void static test_sycl_contraction(const Device &sycl_device, IndexType m_size,
37 IndexType k_size, IndexType n_size) {
38 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
39 DimPair;
40 static const DataType error_threshold = DataType(1e-4);
41 // with these dimensions, the output has 300 * 140 elements, which is
42 // more than 30 * 1024, which is the number of threads in blocks on
43 // a 15 SM GK110 GPU
44 Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
45 Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
46 Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
47 Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
48 Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
49 Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
50 Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
51 Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
52
53 t_left.setRandom();
54 t_right.setRandom();
55
56 std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
57 std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
58 std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
59
60 DataType *d_t_left =
61 static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
62 DataType *d_t_right =
63 static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
64 DataType *d_t_result =
65 static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
66
67 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
68 gpu_t_left(d_t_left, left_dims);
69 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
70 gpu_t_right(d_t_right, right_dims);
71 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
72 gpu_t_result(d_t_result, result_dims);
73
74 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
75 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
76
77 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
78 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
79 t_result_bytes);
80
81 t_result = t_left.contract(t_right, dims);
82
83 for (IndexType i = 0; i < t_result.size(); i++) {
84 if (static_cast<DataType>(std::fabs(static_cast<DataType>(
85 t_result(i) - t_result_gpu(i)))) < error_threshold) {
86 continue;
87 }
88 if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
89 error_threshold)) {
90 continue;
91 }
92
93 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
94 << ", mismatch detected at IndexType " << i << ": " << t_result(i)
95 << " vs " << t_result_gpu(i) << std::endl;
96 VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
97 }
98 sycl_device.deallocate(d_t_left);
99 sycl_device.deallocate(d_t_right);
100 sycl_device.deallocate(d_t_result);
101 }
102
103 template <int DataLayout, typename DataType, typename IndexType,
104 typename Device>
test_sycl_contraction_m(const Device & sycl_device)105 void test_sycl_contraction_m(const Device &sycl_device) {
106 for (IndexType k = 32; k < 256; k++) {
107 test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128,
108 128);
109 }
110 }
111
112 template <int DataLayout, typename DataType, typename IndexType,
113 typename Device>
test_sycl_contraction_k(const Device & sycl_device)114 void test_sycl_contraction_k(const Device &sycl_device) {
115 for (IndexType k = 32; k < 256; k++) {
116 test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k,
117 128);
118 }
119 }
120
121 template <int DataLayout, typename DataType, typename IndexType,
122 typename Device>
test_sycl_contraction_n(const Device & sycl_device)123 void test_sycl_contraction_n(const Device &sycl_device) {
124 for (IndexType k = 32; k < 256; k++) {
125 test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128,
126 128, k);
127 }
128 }
129
130 template <int DataLayout, typename DataType, typename IndexType,
131 typename Device>
test_sycl_contraction_sizes(const Device & sycl_device)132 void test_sycl_contraction_sizes(const Device &sycl_device) {
133 IndexType m_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255,
134 257, 511, 512, 513, 1023, 1024, 1025};
135
136 IndexType n_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255,
137 257, 511, 512, 513, 1023, 1024, 1025};
138
139 IndexType k_sizes[] = {31, 39, 63, 64, 65, 95, 96, 127, 129,
140 255, 257, 511, 512, 513, 1023, 1024, 1025};
141
142 for (IndexType i = 0; i < 15; i++) {
143 for (IndexType j = 0; j < 15; j++) {
144 for (IndexType k = 0; k < 17; k++) {
145 test_sycl_contraction<DataLayout, DataType, IndexType>(
146 sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
147 }
148 }
149 }
150 }
151
152 template <int DataLayout, typename DataType, typename IndexType,
153 typename Device>
test_no_out_of_bounds(const Device & sycl_device,IndexType m_size,IndexType k_size,IndexType n_size)154 void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size,
155 IndexType k_size, IndexType n_size) {
156 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
157 DimPair;
158 static const DataType error_threshold = DataType(1e-4);
159 Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
160 Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
161 Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
162
163 Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
164 Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
165 Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
166 Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
167
168 t_left.setRandom();
169 t_right.setRandom();
170
171 // Allocate buffers twice as big to check for invalid read and write
172 auto padded_left_size = 2 * t_left.size();
173 auto padded_right_size = 2 * t_right.size();
174 auto padded_result_size = 2 * t_result.size();
175
176 std::size_t t_left_bytes = padded_left_size * sizeof(DataType);
177 std::size_t t_right_bytes = padded_right_size * sizeof(DataType);
178 std::size_t t_result_bytes = padded_result_size * sizeof(DataType);
179
180 DataType *d_t_left =
181 static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
182 DataType *d_t_right =
183 static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
184 DataType *d_t_result =
185 static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
186
187 // TensorMaps are still of the same size than the Tensors
188 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
189 gpu_t_left(d_t_left, left_dims);
190 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
191 gpu_t_right(d_t_right, right_dims);
192 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
193 gpu_t_result(d_t_result, result_dims);
194
195 // Write nan after the actual buffer to propagate nans everywhere in case of
196 // invalid reads
197 DataType nan = std::numeric_limits<DataType>::quiet_NaN();
198 auto host_left_data = new DataType[padded_left_size];
199 std::copy_n(t_left.data(), t_left.size(), host_left_data);
200 std::fill_n(host_left_data + t_left.size(), t_left.size(), nan);
201 auto host_right_data = new DataType[padded_right_size];
202 std::copy_n(t_right.data(), t_right.size(), host_right_data);
203 std::fill_n(host_right_data + t_right.size(), t_right.size(), nan);
204 auto host_result_data = new DataType[padded_result_size];
205 std::fill_n(host_result_data, padded_result_size, nan);
206
207 sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes);
208 sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes);
209 sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes);
210
211 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
212 sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes);
213
214 t_result = t_left.contract(t_right, dims);
215
216 for (IndexType i = 0; i < t_result.size(); i++) {
217 if (static_cast<DataType>(std::fabs(static_cast<DataType>(
218 t_result(i) - host_result_data[i]))) < error_threshold) {
219 continue;
220 }
221 if (Eigen::internal::isApprox(t_result(i), host_result_data[i],
222 error_threshold)) {
223 continue;
224 }
225 if (std::isnan(host_result_data[i])) {
226 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
227 << ", invalid read detected at IndexType " << i << ": "
228 << t_result(i) << " vs " << host_result_data[i] << std::endl;
229 } else {
230 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
231 << ", mismatch detected at IndexType " << i << ": "
232 << t_result(i) << " vs " << host_result_data[i] << std::endl;
233 }
234 VERIFY_IS_APPROX(host_result_data[i], t_result(i));
235 }
236 // Make sure that the rest of the result is still nans
237 for (IndexType i = t_result.size(); i < padded_result_size; i++) {
238 if (std::isnan(host_result_data[i])) {
239 continue;
240 }
241 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
242 << ", invalid write detected at IndexType " << i << ": "
243 << host_result_data[i] << std::endl;
244 VERIFY_IS_APPROX(host_result_data[i], t_result(i));
245 }
246 sycl_device.deallocate(d_t_left);
247 sycl_device.deallocate(d_t_right);
248 sycl_device.deallocate(d_t_result);
249
250 delete[] host_left_data;
251 delete[] host_right_data;
252 delete[] host_result_data;
253 }
254
255 template <int DataLayout, typename DataType, typename IndexType,
256 typename Device>
test_scalar(const Device & sycl_device,IndexType m_size,IndexType k_size,IndexType n_size)257 void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size,
258 IndexType n_size) {
259 // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size <<
260 // ")" << std::endl;
261 // with these dimensions, the output has 300 * 140 elements, which is
262 // more than 30 * 1024, which is the number of threads in blocks on
263 // a 15 SM GK110 GPU
264 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
265 DimPair;
266 static const DataType error_threshold = DataType(1e-4);
267 Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
268 Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
269 Tensor<DataType, 0, DataLayout, IndexType> t_result;
270 Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu;
271 Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
272 Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
273 Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
274 t_left.setRandom();
275 t_right.setRandom();
276
277 std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
278 std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
279 std::size_t t_result_bytes = sizeof(DataType);
280
281 DataType *d_t_left =
282 static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
283 DataType *d_t_right =
284 static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
285 DataType *d_t_result =
286 static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
287
288 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
289 gpu_t_left(d_t_left, left_dims);
290 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
291 gpu_t_right(d_t_right, right_dims);
292 Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType>>
293 gpu_t_result(d_t_result);
294
295 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
296 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
297
298 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
299 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
300 t_result_bytes);
301
302 t_result = t_left.contract(t_right, dims);
303
304 if (static_cast<DataType>(std::fabs(static_cast<DataType>(
305 t_result() - t_result_gpu()))) > error_threshold &&
306 !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
307 std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size
308 << " : mismatch detected: " << t_result() << " vs "
309 << t_result_gpu() << std::endl;
310 VERIFY_IS_APPROX(t_result_gpu(), t_result());
311 }
312
313 sycl_device.deallocate(d_t_left);
314 sycl_device.deallocate(d_t_right);
315 sycl_device.deallocate(d_t_result);
316 }
317
318 template <int DataLayout, typename DataType, typename IndexType,
319 typename Device>
contraction_batch(const Device & sycl_device,IndexType m_size,IndexType k_size,IndexType n_size,IndexType m_batch,IndexType start,IndexType limit)320 void contraction_batch(const Device &sycl_device, IndexType m_size,
321 IndexType k_size, IndexType n_size, IndexType m_batch,
322 IndexType start, IndexType limit) {
323 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
324 DimPair;
325 static const DataType error_threshold = DataType(1e-4);
326 typedef Eigen::array<IndexType, 3> TensorDim;
327 typedef Eigen::Tensor<DataType, 3, DataLayout, IndexType> TensorType;
328 TensorDim left_dims = {{m_batch, k_size, m_size}};
329 TensorDim right_dims = {{m_batch, n_size, k_size}};
330 TensorDim res_dims = {{m_batch, m_size, n_size}};
331 Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}};
332
333 TensorType t_left(left_dims);
334 TensorType t_right(right_dims);
335 TensorType t_result_gpu(res_dims);
336 TensorType t_result(res_dims);
337
338 t_left.setRandom();
339 t_right.setRandom();
340
341 std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
342 std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
343 std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
344
345 DataType *d_t_left =
346 static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
347 DataType *d_t_right =
348 static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
349 DataType *d_t_result =
350 static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
351
352 Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims);
353 Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims);
354 Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims);
355
356 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
357 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
358 for (int i = start; i < limit; ++i) {
359 auto x = gpu_t_left.template chip<0>(i);
360 auto y = gpu_t_right.template chip<0>(i);
361 auto z = gpu_t_result.template chip<0>(i);
362 z.device(sycl_device) = x.contract(y, contract_pairs);
363 }
364 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
365 t_result_bytes);
366
367 for (int i = start; i < limit; ++i) {
368 auto x = t_left.template chip<0>(i);
369 auto y = t_right.template chip<0>(i);
370 auto z = t_result.template chip<0>(i);
371 z = x.contract(y, contract_pairs);
372 }
373
374 for (IndexType i = 0; i < t_result.size(); i++) {
375 if (static_cast<DataType>(std::fabs(static_cast<DataType>(
376 t_result(i) - t_result_gpu(i)))) < error_threshold) {
377 continue;
378 }
379 if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
380 error_threshold)) {
381 continue;
382 }
383 std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
384 << " vs " << t_result_gpu(i) << std::endl;
385 VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
386 }
387 sycl_device.deallocate(d_t_left);
388 sycl_device.deallocate(d_t_right);
389 sycl_device.deallocate(d_t_result);
390 }
391
392 template <int DataLayout, typename DataType, typename IndexType,
393 typename Device>
contraction_rhs_transposed(const Device & sycl_device,IndexType m_size,IndexType k_size,IndexType n_size)394 void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size,
395 IndexType k_size, IndexType n_size) {
396 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
397 DimPair;
398 static const DataType error_threshold = DataType(1e-4);
399 Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
400 Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
401 Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
402 Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}};
403
404 Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
405 Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
406 Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
407 Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
408
409 t_left.setRandom();
410 t_right.setRandom();
411
412 std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
413 std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
414 std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
415
416 DataType *d_t_left =
417 static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
418 DataType *d_t_right =
419 static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
420 DataType *d_t_result =
421 static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
422
423 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
424 gpu_t_left(d_t_left, left_dims);
425 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
426 gpu_t_right(d_t_right, right_dims);
427 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
428 gpu_t_result(d_t_result, res_dims);
429
430 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
431 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
432
433 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
434 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
435 t_result_bytes);
436
437 t_result = t_left.contract(t_right, dims);
438
439 for (IndexType j = 0; j < m_size; j++) {
440 for (IndexType i = 0; i < n_size; i++) {
441 if (static_cast<DataType>(std::fabs(static_cast<DataType>(
442 t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) {
443 continue;
444 }
445 if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i),
446 error_threshold)) {
447 continue;
448 }
449 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
450 << ", mismatch detected at IndexType m: " << j << " n: " << i
451 << " CPU : " << t_result(j, i)
452 << " vs SYCL:" << t_result_gpu(j, i) << std::endl;
453 VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i));
454 }
455 }
456 sycl_device.deallocate(d_t_left);
457 sycl_device.deallocate(d_t_right);
458 sycl_device.deallocate(d_t_result);
459 }
460
461 template <int DataLayout, typename DataType, typename IndexType,
462 typename Device>
contraction_lhs_transposed(const Device & sycl_device,IndexType m_size,IndexType k_size,IndexType n_size)463 void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size,
464 IndexType k_size, IndexType n_size) {
465 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
466 DimPair;
467 static const DataType error_threshold = DataType(1e-4);
468 Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
469 Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
470 Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
471 Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
472
473 Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
474 Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
475 Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
476 Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
477
478 t_left.setRandom();
479 t_right.setRandom();
480
481 std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
482 std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
483 std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
484
485 DataType *d_t_left =
486 static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
487 DataType *d_t_right =
488 static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
489 DataType *d_t_result =
490 static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
491
492 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
493 gpu_t_left(d_t_left, left_dims);
494 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
495 gpu_t_right(d_t_right, right_dims);
496 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
497 gpu_t_result(d_t_result, res_dims);
498
499 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
500 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
501
502 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
503 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
504 t_result_bytes);
505
506 t_result = t_left.contract(t_right, dims);
507
508 for (IndexType i = 0; i < t_result.size(); i++) {
509 if (static_cast<DataType>(std::fabs(static_cast<DataType>(
510 t_result(i) - t_result_gpu(i)))) < error_threshold) {
511 continue;
512 }
513 if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
514 error_threshold)) {
515 continue;
516 }
517 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
518 << ", mismatch detected at IndexType " << i << ": " << t_result(i)
519 << " vs " << t_result_gpu(i) << std::endl;
520 VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
521 }
522 sycl_device.deallocate(d_t_left);
523 sycl_device.deallocate(d_t_right);
524 sycl_device.deallocate(d_t_result);
525 }
526
527 template <int DataLayout, typename DataType, typename IndexType,
528 typename Device>
contraction_both_transposed(const Device & sycl_device,IndexType m_size,IndexType k_size,IndexType n_size)529 void contraction_both_transposed(const Device &sycl_device, IndexType m_size,
530 IndexType k_size, IndexType n_size) {
531 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
532 DimPair;
533 static const DataType error_threshold = DataType(1e-4);
534 Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
535 Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
536 Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
537 Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}};
538
539 Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
540 Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
541 Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
542 Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
543
544 t_left.setRandom();
545 t_right.setRandom();
546
547 std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
548 std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
549 std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
550
551 DataType *d_t_left =
552 static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
553 DataType *d_t_right =
554 static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
555 DataType *d_t_result =
556 static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
557
558 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
559 gpu_t_left(d_t_left, left_dims);
560 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
561 gpu_t_right(d_t_right, right_dims);
562 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
563 gpu_t_result(d_t_result, res_dims);
564
565 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
566 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
567
568 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
569 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
570 t_result_bytes);
571
572 t_result = t_left.contract(t_right, dims);
573
574 for (IndexType i = 0; i < t_result.size(); i++) {
575 if (static_cast<DataType>(std::fabs(static_cast<DataType>(
576 t_result(i) - t_result_gpu(i)))) < error_threshold) {
577 continue;
578 }
579 if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
580 error_threshold)) {
581 continue;
582 }
583 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
584 << ", mismatch detected at IndexType " << i << ": " << t_result(i)
585 << " vs " << t_result_gpu(i) << std::endl;
586
587 VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
588 }
589 sycl_device.deallocate(d_t_left);
590 sycl_device.deallocate(d_t_right);
591 sycl_device.deallocate(d_t_result);
592 }
593
594 template <typename Dev>
tensorOutofBound(const Dev & sycl_device)595 void inline tensorOutofBound(const Dev &sycl_device) {
596 typedef float DataType;
597 typedef int64_t IndexType;
598 std::chrono::time_point<std::chrono::system_clock> start, end;
599 start = std::chrono::system_clock::now();
600 // Test out of bound for Tensor-Tensor
601 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
602 1024);
603 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
604 4096);
605 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024,
606 2048);
607 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
608 1024);
609 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024,
610 784);
611 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
612 10);
613 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096,
614 513);
615 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024,
616 783);
617 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
618 784);
619 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024,
620 11);
621 end = std::chrono::system_clock::now();
622 std::chrono::duration<double> elapsed_seconds = end - start;
623 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
624 std::cout << "tensor out of bound tests finished computation at "
625 << std::ctime(&end_time)
626 << "elapsed time: " << elapsed_seconds.count() << "s\n";
627 }
628
629 template <typename Dev>
tensorTensor(const Dev & sycl_device)630 void inline tensorTensor(const Dev &sycl_device) {
631 typedef float DataType;
632 typedef int64_t IndexType;
633 std::chrono::time_point<std::chrono::system_clock> start, end;
634 start = std::chrono::system_clock::now();
635 // Tensor Tensor Contraction
636 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128,
637 128);
638 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128,
639 128);
640 end = std::chrono::system_clock::now();
641 std::chrono::duration<double> elapsed_seconds = end - start;
642 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
643 std::cout << "tensor tensor tests finished computation at "
644 << std::ctime(&end_time)
645 << "elapsed time: " << elapsed_seconds.count() << "s\n";
646 }
647
648 template <typename Dev>
tensorTensor_m(const Dev & sycl_device)649 void inline tensorTensor_m(const Dev &sycl_device) {
650 typedef float DataType;
651 typedef int64_t IndexType;
652 std::chrono::time_point<std::chrono::system_clock> start, end;
653 start = std::chrono::system_clock::now();
654 // Tensor Tensor Contraction
655 test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device);
656 test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device);
657
658 end = std::chrono::system_clock::now();
659 std::chrono::duration<double> elapsed_seconds = end - start;
660 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
661 std::cout << "tensor tensor tests finished computation at "
662 << std::ctime(&end_time)
663 << "elapsed time: " << elapsed_seconds.count() << "s\n";
664 }
665
666 template <typename Dev>
tensorTensor_n(const Dev & sycl_device)667 void inline tensorTensor_n(const Dev &sycl_device) {
668 typedef float DataType;
669 typedef int64_t IndexType;
670 std::chrono::time_point<std::chrono::system_clock> start, end;
671 start = std::chrono::system_clock::now();
672 // Tensor Tensor Contraction
673 test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device);
674 test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device);
675
676 end = std::chrono::system_clock::now();
677 std::chrono::duration<double> elapsed_seconds = end - start;
678 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
679 std::cout << "tensor tensor tests finished computation at "
680 << std::ctime(&end_time)
681 << "elapsed time: " << elapsed_seconds.count() << "s\n";
682 }
683
684 template <typename Dev>
tensorTensor_k(const Dev & sycl_device)685 void inline tensorTensor_k(const Dev &sycl_device) {
686 typedef float DataType;
687 typedef int64_t IndexType;
688 std::chrono::time_point<std::chrono::system_clock> start, end;
689 start = std::chrono::system_clock::now();
690 test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device);
691 test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device);
692
693 end = std::chrono::system_clock::now();
694 std::chrono::duration<double> elapsed_seconds = end - start;
695 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
696 std::cout << "tensor tensor tests finished computation at "
697 << std::ctime(&end_time)
698 << "elapsed time: " << elapsed_seconds.count() << "s\n";
699 }
700
701 template <typename Dev>
tensorTensor_sizes(const Dev & sycl_device)702 void inline tensorTensor_sizes(const Dev &sycl_device) {
703 typedef float DataType;
704 typedef int64_t IndexType;
705 std::chrono::time_point<std::chrono::system_clock> start, end;
706 start = std::chrono::system_clock::now();
707 // Tensor Tensor Contraction
708 test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device);
709 test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device);
710
711 end = std::chrono::system_clock::now();
712 std::chrono::duration<double> elapsed_seconds = end - start;
713 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
714 std::cout << "tensor tensor tests finished computation at "
715 << std::ctime(&end_time)
716 << "elapsed time: " << elapsed_seconds.count() << "s\n";
717 }
718 template <typename Dev>
vectorVector(const Dev & sycl_device)719 void inline vectorVector(const Dev &sycl_device) {
720 typedef float DataType;
721 typedef int64_t IndexType;
722 std::chrono::time_point<std::chrono::system_clock> start, end;
723 start = std::chrono::system_clock::now();
724 // VECTOR-VECTOR
725 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1,
726 1025);
727 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1,
728 1025);
729 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1,
730 1024);
731 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1,
732 1024);
733 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1,
734 1023);
735 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1,
736 1023);
737
738 end = std::chrono::system_clock::now();
739 std::chrono::duration<double> elapsed_seconds = end - start;
740 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
741 std::cout << "contracted tensor tests finished computation at "
742 << std::ctime(&end_time)
743 << "elapsed time: " << elapsed_seconds.count() << "s\n";
744 }
745
746 template <typename Dev>
vectorTensor(const Dev & sycl_device)747 void inline vectorTensor(const Dev &sycl_device) {
748 typedef float DataType;
749 typedef int64_t IndexType;
750 std::chrono::time_point<std::chrono::system_clock> start, end;
751 start = std::chrono::system_clock::now();
752 // Vector-Tensor
753 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025,
754 1025);
755 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025,
756 1025);
757 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024,
758 1024);
759 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024,
760 1024);
761 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023,
762 1023);
763 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023,
764 1023);
765
766 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097,
767 4097);
768 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097,
769 4097);
770 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096,
771 4096);
772 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096,
773 4096);
774 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095,
775 4095);
776 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095,
777 4095);
778 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816,
779 32);
780
781 end = std::chrono::system_clock::now();
782 std::chrono::duration<double> elapsed_seconds = end - start;
783 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
784 std::cout << "finished computation at " << std::ctime(&end_time)
785 << "elapsed time: " << elapsed_seconds.count() << "s\n";
786 }
787
788 template <typename Dev>
tensorVector(const Dev & sycl_device)789 void inline tensorVector(const Dev &sycl_device) {
790 typedef float DataType;
791 typedef int64_t IndexType;
792 std::chrono::time_point<std::chrono::system_clock> start, end;
793 start = std::chrono::system_clock::now();
794 // Matrix-Vector
795 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025,
796 1);
797 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025,
798 1);
799 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024,
800 1);
801 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
802 1);
803 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023,
804 1);
805 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023,
806 1);
807 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197,
808 1);
809 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097,
810 1);
811 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096,
812 1);
813 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196,
814 1);
815 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095,
816 1);
817 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095,
818 1);
819 // If the GEMV disabled it will creates one kernel to calculate the contraction.
820 // Therefore the acumuation of float number will overflow the precision
821 // threshold for float and cause the test to fail. While it the GMV multiple
822 // kernel will be created and each one run the overflow of accumutation breaks
823 // among the kernels.
824 #ifndef EIGEN_SYCL_DISABLE_GEMV
825 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032,
826 1);
827 #endif
828
829 end = std::chrono::system_clock::now();
830 std::chrono::duration<double> elapsed_seconds = end - start;
831 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
832 std::cout << "finished computation at " << std::ctime(&end_time)
833 << "elapsed time: " << elapsed_seconds.count() << "s\n";
834 }
835
836 template <typename Dev>
tensorScalar(const Dev & sycl_device)837 void inline tensorScalar(const Dev &sycl_device) {
838 typedef float DataType;
839 typedef int64_t IndexType;
840 std::chrono::time_point<std::chrono::system_clock> start, end;
841 start = std::chrono::system_clock::now();
842 // SCALAR Contraction
843 test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
844 test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
845 test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
846 test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
847 test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
848 test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
849
850 end = std::chrono::system_clock::now();
851 std::chrono::duration<double> elapsed_seconds = end - start;
852 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
853 std::cout << "finished computation at " << std::ctime(&end_time)
854 << "elapsed time: " << elapsed_seconds.count() << "s\n";
855 }
856
857 template <typename Dev>
skinnyTensor_row(const Dev & sycl_device)858 void inline skinnyTensor_row(const Dev &sycl_device) {
859 typedef float DataType;
860 typedef int64_t IndexType;
861 std::chrono::time_point<std::chrono::system_clock> start, end;
862 start = std::chrono::system_clock::now();
863 // Tensor Tensor Contraction
864 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
865 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073,
866 257);
867 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072,
868 256);
869 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073,
870 16);
871 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072,
872 17);
873 end = std::chrono::system_clock::now();
874 std::chrono::duration<double> elapsed_seconds = end - start;
875 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
876 std::cout << "finished computation at " << std::ctime(&end_time)
877 << "elapsed time: " << elapsed_seconds.count() << "s\n";
878 }
879
880 template <typename Dev>
skinnyTensor_col(const Dev & sycl_device)881 void inline skinnyTensor_col(const Dev &sycl_device) {
882 typedef float DataType;
883 typedef int64_t IndexType;
884 std::chrono::time_point<std::chrono::system_clock> start, end;
885 start = std::chrono::system_clock::now();
886 // Tensor Tensor Contraction
887 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
888 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073,
889 257);
890 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072,
891 256);
892 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073,
893 16);
894 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072,
895 17);
896 end = std::chrono::system_clock::now();
897 std::chrono::duration<double> elapsed_seconds = end - start;
898 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
899 std::cout << "finished computation at " << std::ctime(&end_time)
900 << "elapsed time: " << elapsed_seconds.count() << "s\n";
901 }
902
903 template <typename Dev>
tensor_contraction_batch_per_device(const Dev & sycl_device)904 void inline tensor_contraction_batch_per_device(const Dev &sycl_device) {
905 typedef float DataType;
906 typedef int64_t IndexType;
907 std::chrono::time_point<std::chrono::system_clock> start, end;
908 start = std::chrono::system_clock::now();
909
910 contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
911 0, 4);
912 contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
913 0, 4);
914 end = std::chrono::system_clock::now();
915 std::chrono::duration<double> elapsed_seconds = end - start;
916 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
917 std::cout << "finished computation at " << std::ctime(&end_time)
918 << "elapsed time: " << elapsed_seconds.count() << "s\n";
919 }
920
921 template <typename Dev>
tensor_contraction_lhs_transposed_per_device(const Dev & sycl_device)922 void inline tensor_contraction_lhs_transposed_per_device(
923 const Dev &sycl_device) {
924 typedef float DataType;
925 typedef int64_t IndexType;
926 std::chrono::time_point<std::chrono::system_clock> start, end;
927 start = std::chrono::system_clock::now();
928
929 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4,
930 8);
931 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
932 32);
933 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
934 64);
935 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784,
936 2048, 1024);
937 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
938 10, 1024);
939 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
940 1024, 1024);
941 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
942 4096, 1024);
943 end = std::chrono::system_clock::now();
944 std::chrono::duration<double> elapsed_seconds = end - start;
945 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
946 std::cout << "finished computation at " << std::ctime(&end_time)
947 << "elapsed time: " << elapsed_seconds.count() << "s\n";
948 }
949
950 template <typename Dev>
tensor_contraction_rhs_transposed_per_device(const Dev & sycl_device)951 void inline tensor_contraction_rhs_transposed_per_device(
952 const Dev &sycl_device) {
953 typedef float DataType;
954 typedef int64_t IndexType;
955 std::chrono::time_point<std::chrono::system_clock> start, end;
956 start = std::chrono::system_clock::now();
957
958 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4,
959 16);
960 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
961 17);
962 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
963 32);
964 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
965 64);
966 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10,
967 1024, 1024);
968 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
969 1024, 4096);
970 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
971 1024, 2048);
972 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
973 1024, 784);
974 end = std::chrono::system_clock::now();
975 std::chrono::duration<double> elapsed_seconds = end - start;
976 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
977 std::cout << "finished computation at " << std::ctime(&end_time)
978 << "elapsed time: " << elapsed_seconds.count() << "s\n";
979 }
980
981 template <typename Dev>
tensor_contraction_both_transposed_per_device(const Dev & sycl_device)982 void inline tensor_contraction_both_transposed_per_device(
983 const Dev &sycl_device) {
984 typedef float DataType;
985 typedef int64_t IndexType;
986 std::chrono::time_point<std::chrono::system_clock> start, end;
987 start = std::chrono::system_clock::now();
988
989 contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
990 17);
991 contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
992 32);
993 contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64,
994 16, 64);
995 end = std::chrono::system_clock::now();
996 std::chrono::duration<double> elapsed_seconds = end - start;
997 std::time_t end_time = std::chrono::system_clock::to_time_t(end);
998 std::cout << "finished computation at " << std::ctime(&end_time)
999 << "elapsed time: " << elapsed_seconds.count() << "s\n";
1000 }
1001
// Test entry point: enumerates every SYCL device Eigen supports on this
// machine and runs the full suite of contraction sub-tests on each one.
EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) {
  for (const auto &device : Eigen::get_sycl_supported_devices()) {
    // Log which device backs this iteration so failures can be attributed.
    std::cout << "Running on "
              << device.template get_info<cl::sycl::info::device::name>()
              << std::endl;
    // One queue (and one Eigen device wrapper) per physical device.
    QueueInterface queueInterface(device);
    auto sycl_device = Eigen::SyclDevice(&queueInterface);
    CALL_SUBTEST_1(tensorOutofBound(sycl_device));
    CALL_SUBTEST_2(tensorTensor(sycl_device));
    CALL_SUBTEST_2(tensorTensor_m(sycl_device));
    CALL_SUBTEST_2(tensorTensor_n(sycl_device));
    CALL_SUBTEST_2(tensorTensor_k(sycl_device));
    CALL_SUBTEST_2(tensorTensor_sizes(sycl_device));
    CALL_SUBTEST_3(vectorVector(sycl_device));
    CALL_SUBTEST_4(vectorTensor(sycl_device));
    CALL_SUBTEST_5(tensorVector(sycl_device));
    CALL_SUBTEST_6(tensorScalar(sycl_device));
    CALL_SUBTEST_7(skinnyTensor_row(sycl_device));
    CALL_SUBTEST_7(skinnyTensor_col(sycl_device));
    CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device));
    CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device));
    CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device));
    CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device));
  }
}
1027