// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#define EIGEN_HAS_CONSTEXPR 1

#include "main.h"

#include <unsupported/Eigen/CXX11/Tensor>

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 753;
  const IndexType num_cols = 537;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  array<IndexType, 2> outRange = {{1, 1}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
  Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);

  in.setRandom();
  auto dim = DSizes<IndexType, 2>(1, 1);
  full_redux = in.sum().reshape(dim);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize())));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data,
                                                                outRange);
  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
  sycl_device.memcpyDeviceToHost(
      full_redux_gpu.data(), gpu_out_data,
      (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  std::cout << "SYCL FULL :" << full_redux_gpu(0, 0)
            << ", CPU FULL: " << full_redux(0, 0) << "\n";
  VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_sum_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.sum();

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_max_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 4096;
  const IndexType num_cols = 4096;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();

  full_redux = in.maximum();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set the initial value to be the max.
  // As we don't include this in the reduction, the result should not be 2.
  in(0) = static_cast<DataType>(2);

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.maximum();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 4096;
  const IndexType num_cols = 4096;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
  array<IndexType, 1> argRange = {{num_cols}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  //  red_axis[1]=1;
  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();
  in_arg1.setRandom();
  in_arg2.setRandom();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate(
      in_arg1.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate(
      in_arg2.dimensions().TotalSize() * sizeof(DataType)));
  bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate(
      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
  bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate(
      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));

  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(
      gpu_in_arg1_data, tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(
      gpu_in_arg2_data, tensorRange);
  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(
      gpu_out_arg_data, argRange);
  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(
      gpu_out_arg__gpu_helper_data, argRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);

  // CPU VERSION
  out_arg_cpu =
      (in_arg1.argmax(1) == in_arg2.argmax(1))
          .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
  full_redux = (out_arg_cpu.template cast<DataType>())
                   .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());

  // GPU VERSION
  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  sycl_device.memcpyHostToDevice(
      gpu_in_arg1_data, in_arg1.data(),
      (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
  sycl_device.memcpyHostToDevice(
      gpu_in_arg2_data, in_arg2.data(),
      (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
  out_Argout_gpu_helper.device(sycl_device) =
      (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
  out_Argout_gpu.device(sycl_device) =
      (out_Argout_gpu_helper)
          .select(out_Argout_gpu.constant(true),
                  out_Argout_gpu.constant(false));
  out_gpu.device(sycl_device) =
      (out_Argout_gpu.template cast<DataType>())
          .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux()
            << '\n';
  VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_in_arg1_data);
  sycl_device.deallocate(gpu_in_arg2_data);
  sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
  sycl_device.deallocate(gpu_out_arg_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.mean();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_with_odd_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  // This is a particular case which illustrates a possible problem when the
  // number of local threads in a workgroup is even, but is not a power of two.
  using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
  // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
  const IndexType n_elems = 8707;
  array<IndexType, 1> tensor_range = {{n_elems}};

  data_tensor in(tensor_range);
  DataType full_redux;
  DataType full_redux_gpu;
  TensorMap<scalar_tensor> red_cpu(&full_redux);
  TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);

  const DataType const_val = static_cast<DataType>(0.6391);
  in = in.constant(const_val);

  Eigen::IndexList<Eigen::type2index<0>> red_axis;
  red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  VERIFY_IS_APPROX(const_val, red_cpu());

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) =
      in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu, full_redux);

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_min_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 876;
  const IndexType num_cols = 953;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();

  full_redux = in.minimum();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.minimum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_min_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set the initial value to be the min.
  // As we don't include this in the reduction, the result should not be -2.
  in(0) = static_cast<DataType>(-2);

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.minimum();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.minimum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_max_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 145;
  IndexType dim_y = 1;
  IndexType dim_z = 67;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.maximum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;

  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  array<IndexType, 1> reduced_range = {{num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);
  const IndexType n_reduced = num_cols;

  data_tensor in(tensor_range);
  reduced_tensor redux;
  reduced_tensor redux_gpu(reduced_range);

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set maximum value outside of the considered range.
  for (IndexType i = 0; i < n_reduced; i++) {
    in(i) = static_cast<DataType>(2);
  }

  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  redux = in_offset.maximum(red_axis);
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_NOT_EQUAL(redux(i), in(i));
  }

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(
      sycl_device.allocate(n_reduced * sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data,
                                 n_reduced * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_APPROX(redux_gpu(i), redux(i));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;

  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  array<IndexType, 1> full_reduced_range = {{num_rows}};
  array<IndexType, 1> reduced_range = {{num_rows - 1}};
  const IndexType n_elems = internal::array_prod(tensor_range);
  const IndexType n_reduced = reduced_range[0];

  data_tensor in(tensor_range);
  reduced_tensor redux(full_reduced_range);
  reduced_tensor redux_gpu(reduced_range);

  in.setRandom();
  redux.setZero();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set maximum value outside of the considered range.
  for (IndexType i = 0; i < n_reduced; i++) {
    in(i) = static_cast<DataType>(2);
  }

  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 1;

  const IndexType offset = 64;
  // Introduce an offset in both the input and the output.
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
  red_offset = in_offset.maximum(red_axis);

  // Check that the first value hasn't been changed and that the reduced values
  // are not equal to the previously set maximum in the input outside the range.
  VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
  }

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(
      sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(),
                                 n_reduced * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) {
  array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 1> reduced_tensorRange = {{dim_y}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();
  redux = in.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < redux.size(); i++) {
    VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
  }
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 145;
  IndexType dim_y = 1;
  IndexType dim_z = 67;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 64;
  IndexType dim_y = 1;
  IndexType dim_z = 32;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 2;
  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 64;
  IndexType dim_y = 1;
  IndexType dim_z = 32;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 2;
  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  auto tensorRange = Sizes<64, 32>(64, 32);
  // auto red_axis =  Sizes<0,1>(0,1);
  Eigen::IndexList<Eigen::type2index<1>> red_axis;
  auto reduced_tensorRange = Sizes<64>(64);
  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;

  in_fix.setRandom();

  redux_fix = in_fix.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in_fix.data(),
      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu_fix.data(), gpu_out_data,
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  auto tensorRange = Sizes<64, 32>(64, 32);
  Eigen::IndexList<Eigen::type2index<1>> red_axis;
  auto reduced_tensorRange = Sizes<64>(64);
  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;

  in_fix.setRandom();
  redux_fix = in_fix.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in_fix.data(),
      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu_fix.data(), gpu_out_data,
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
  sycl_device.synchronize();
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

// SYCL supports the generic case of a reduction whose accumulator has a
// different type than the input data. This reducer is an example of how to
// check, in a single reduction, whether a Tensor contains NaN and/or Inf.
template <typename InT, typename OutT>
struct CustomReducer {
  static const bool PacketAccess = false;
  static const bool IsStateful = false;

  static constexpr OutT InfBit = 1;
  static constexpr OutT NanBit = 2;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
                                                    OutT* accum) const {
    if (Eigen::numext::isinf(x))
      *accum |= InfBit;
    else if (Eigen::numext::isnan(x))
      *accum |= NanBit;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
                                                    OutT* accum) const {
    *accum |= x;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
    return OutT(0);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
    return accum;
  }
};

template <typename DataType, typename AccumType, int DataLayout,
          typename IndexType>
static void test_full_reductions_custom_sycl(
    const Eigen::SyclDevice& sycl_device) {
  constexpr IndexType InSize = 64;
  auto tensorRange = Sizes<InSize>(InSize);
  Eigen::IndexList<Eigen::type2index<0>> dims;
  auto reduced_tensorRange = Sizes<>();
  TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
  TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;

  CustomReducer<DataType, AccumType> reducer;

  in_fix.setRandom();

  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
  AccumType* gpu_out_data =
      static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));

  TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
                                 sizeof(AccumType));
  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, typename Dev>
void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);

  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
                                                                  4197, 4097);
  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
                                                                  4197, 4097);
  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
                                                                  129, 8);
  test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
  test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
  sycl_device.synchronize();
}

EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    std::cout << "Running on "
              << device.template get_info<cl::sycl::info::device::name>()
              << std::endl;
    QueueInterface queueInterface(device);
    auto sycl_device = Eigen::SyclDevice(&queueInterface);
    CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
    CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
    CALL_SUBTEST_3(
        sycl_reduction_test_first_dim_per_device<float>(sycl_device));
    CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
  }
}