// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

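// Configure the test: skip the long double and complex scalar variants, use
// 64-bit tensor indices, and enable the SYCL backend of the Tensor module.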
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#define EIGEN_HAS_CONSTEXPR 1

#include "main.h"

#include <unsupported/Eigen/CXX11/Tensor>

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 753;
  const IndexType num_cols = 537;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  array<IndexType, 2> outRange = {{1, 1}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
  Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);

  in.setRandom();
  auto dim = DSizes<IndexType, 2>(1, 1);
  full_redux = in.sum().reshape(dim);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize())));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data,
                                                                outRange);
  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
  sycl_device.memcpyDeviceToHost(
      full_redux_gpu.data(), gpu_out_data,
      (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  std::cout << "SYCL FULL :" << full_redux_gpu(0, 0)
            << ", CPU FULL: " << full_redux(0, 0) << "\n";
  VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_sum_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.sum();

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_max_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 4096;
  const IndexType num_cols = 4096;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();

  full_redux = in.maximum();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set the first input element to the maximum value. Since the offset below
  // excludes it from the reduction, the result should not be 2.
  in(0) = static_cast<DataType>(2);

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.maximum();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 4096;
  const IndexType num_cols = 4096;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
  array<IndexType, 1> argRange = {{num_cols}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  // red_axis[1]=1;
  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();
  in_arg1.setRandom();
  in_arg2.setRandom();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate(
      in_arg1.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate(
      in_arg2.dimensions().TotalSize() * sizeof(DataType)));
  bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate(
      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
  bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate(
      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));

  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(
      gpu_in_arg1_data, tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(
      gpu_in_arg2_data, tensorRange);
  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(
      gpu_out_arg_data, argRange);
  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(
      gpu_out_arg__gpu_helper_data, argRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);

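  // Both the CPU and GPU versions below compute the fraction of
  // first-dimension slices for which in_arg1 and in_arg2 attain their maximum
  // at the same position along the second dimension (the mean of the boolean
  // matches cast to float).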
  // CPU VERSION
  out_arg_cpu =
      (in_arg1.argmax(1) == in_arg2.argmax(1))
          .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
  full_redux = (out_arg_cpu.template cast<float>())
                   .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());

  // GPU VERSION
  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  sycl_device.memcpyHostToDevice(
      gpu_in_arg1_data, in_arg1.data(),
      (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
  sycl_device.memcpyHostToDevice(
      gpu_in_arg2_data, in_arg2.data(),
      (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
  out_Argout_gpu_helper.device(sycl_device) =
      (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
  out_Argout_gpu.device(sycl_device) =
      (out_Argout_gpu_helper)
          .select(out_Argout_gpu.constant(true),
                  out_Argout_gpu.constant(false));
  out_gpu.device(sycl_device) =
      (out_Argout_gpu.template cast<float>())
          .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux()
            << '\n';
  VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_in_arg1_data);
  sycl_device.deallocate(gpu_in_arg2_data);
  sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
  sycl_device.deallocate(gpu_out_arg_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.mean();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_with_odd_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  // This is a particular case which illustrates a possible problem when the
  // number of local threads in a workgroup is even, but is not a power of two.
  using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
  // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
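  // (That is, with roughly 128 elements handled per local thread,
  //  ceil(2177 / 128) = 18 threads are needed: even, but not a power of two.)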
  const IndexType n_elems = 8707;
  array<IndexType, 1> tensor_range = {{n_elems}};

  data_tensor in(tensor_range);
  DataType full_redux;
  DataType full_redux_gpu;
  TensorMap<scalar_tensor> red_cpu(&full_redux);
  TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);

  const DataType const_val = static_cast<DataType>(0.6391);
  in = in.constant(const_val);

  Eigen::IndexList<Eigen::type2index<0>> red_axis;
  red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  VERIFY_IS_APPROX(const_val, red_cpu());

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) =
      in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu, full_redux);

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_min_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 876;
  const IndexType num_cols = 953;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();

  full_redux = in.minimum();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.minimum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_min_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set the first input element to the minimum value. Since the offset below
  // excludes it from the reduction, the result should not be -2.
  in(0) = static_cast<DataType>(-2);

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.minimum();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.minimum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_max_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 145;
  IndexType dim_y = 1;
  IndexType dim_z = 67;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.maximum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;

  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  array<IndexType, 1> reduced_range = {{num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);
  const IndexType n_reduced = num_cols;

  data_tensor in(tensor_range);
  reduced_tensor redux;
  reduced_tensor redux_gpu(reduced_range);

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set maximum value outside of the considered range.
  for (IndexType i = 0; i < n_reduced; i++) {
    in(i) = static_cast<DataType>(2);
  }

  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  redux = in_offset.maximum(red_axis);
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_NOT_EQUAL(redux(i), in(i));
  }

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(
      sycl_device.allocate(n_reduced * sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data,
                                 n_reduced * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_APPROX(redux_gpu(i), redux(i));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;

  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  array<IndexType, 1> full_reduced_range = {{num_rows}};
  array<IndexType, 1> reduced_range = {{num_rows - 1}};
  const IndexType n_elems = internal::array_prod(tensor_range);
  const IndexType n_reduced = reduced_range[0];

  data_tensor in(tensor_range);
  reduced_tensor redux(full_reduced_range);
  reduced_tensor redux_gpu(reduced_range);

  in.setRandom();
  redux.setZero();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set maximum value outside of the considered range.
  for (IndexType i = 0; i < n_reduced; i++) {
    in(i) = static_cast<DataType>(2);
  }

  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 1;

  const IndexType offset = 64;
  // Introduce an offset in both the input and the output.
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
  red_offset = in_offset.maximum(red_axis);

  // Check that the first value hasn't been changed and that the reduced values
  // are not equal to the previously set maximum in the input outside the range.
  VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
  }

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(
      sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(),
                                 n_reduced * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) {
  array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 1> reduced_tensorRange = {{dim_y}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();
  redux = in.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < redux.size(); i++) {
    VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
  }
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 145;
  IndexType dim_y = 1;
  IndexType dim_z = 67;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 64;
  IndexType dim_y = 1;
  IndexType dim_z = 32;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 2;
  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 64;
  IndexType dim_y = 1;
  IndexType dim_z = 32;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 2;
  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  auto tensorRange = Sizes<64, 32>(64, 32);
  // auto red_axis = Sizes<0,1>(0,1);
  Eigen::IndexList<Eigen::type2index<1>> red_axis;
  auto reduced_tensorRange = Sizes<64>(64);
  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;

  in_fix.setRandom();

  redux_fix = in_fix.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in_fix.data(),
      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu_fix.data(), gpu_out_data,
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  auto tensorRange = Sizes<64, 32>(64, 32);
  Eigen::IndexList<Eigen::type2index<1>> red_axis;
  auto reduced_tensorRange = Sizes<64>(64);
  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;

  in_fix.setRandom();
  redux_fix = in_fix.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in_fix.data(),
      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu_fix.data(), gpu_out_data,
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
  sycl_device.synchronize();
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

// SYCL supports the generic case of a reduction whose accumulator has a
// different type than the input data. The reducer below is an example of how
// to determine, in a single reduction, whether a Tensor contains NaN and/or
// Inf values (see also the note after the struct).
template <typename InT, typename OutT>
struct CustomReducer {
  static const bool PacketAccess = false;
  static const bool IsStateful = false;

  static constexpr OutT InfBit = 1;
  static constexpr OutT NanBit = 2;

  // Folds one input element into the accumulator, setting the corresponding
  // bit if the element is Inf or NaN.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
                                                    OutT* accum) const {
    if (Eigen::numext::isinf(x))
      *accum |= InfBit;
    else if (Eigen::numext::isnan(x))
      *accum |= NanBit;
  }

  // Merges a partial accumulator (e.g. from another work-item) into accum.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
                                                    OutT* accum) const {
    *accum |= x;
  }

  // Neutral starting value: no special values seen yet.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
    return OutT(0);
  }

  // No post-processing needed; the accumulated bitmask is the result.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
    return accum;
  }
};
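// For reference: with this reducer, an input containing both an Inf and a NaN
// (e.g. the hypothetical values {1.0, inf, nan}) would reduce to
// InfBit | NanBit == 3, while an all-finite input reduces to 0, which is what
// the test below checks on randomly generated data.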

template <typename DataType, typename AccumType, int DataLayout,
          typename IndexType>
static void test_full_reductions_custom_sycl(
    const Eigen::SyclDevice& sycl_device) {
  constexpr IndexType InSize = 64;
  auto tensorRange = Sizes<InSize>(InSize);
  Eigen::IndexList<Eigen::type2index<0>> dims;
  auto reduced_tensorRange = Sizes<>();
  TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
  TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;

  CustomReducer<DataType, AccumType> reducer;

  in_fix.setRandom();

  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
  AccumType* gpu_out_data =
      static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));

  TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
                                 sizeof(AccumType));
  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, typename Dev>
void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);

  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
                                                                  4197, 4097);
  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
                                                                  4197, 4097);
  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
                                                                  129, 8);
  test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
  test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
  sycl_device.synchronize();
}

EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    std::cout << "Running on "
              << device.template get_info<cl::sycl::info::device::name>()
              << std::endl;
    QueueInterface queueInterface(device);
    auto sycl_device = Eigen::SyclDevice(&queueInterface);
    CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
    CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
    CALL_SUBTEST_3(
        sycl_reduction_test_first_dim_per_device<float>(sycl_device));
    CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
  }
}