// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_USE_THREADS

#include "main.h"

#include <Eigen/CXX11/Tensor>

using Eigen::Tensor;
using Eigen::RowMajor;
using Eigen::ColMajor;
using Eigen::internal::TiledEvaluation;

// A set of tests to verify that different TensorExecutor strategies yield the
// same results for all the ops supporting tiled evaluation.

// Default assignment that does not use block evaluation or vectorization.
// We assume that default coefficient evaluation is well tested and correct.
template <typename Dst, typename Expr>
static void DefaultAssign(Dst& dst, Expr expr) {
  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
  using Executor =
      Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
                                      /*Vectorizable=*/false,
                                      /*Tiling=*/TiledEvaluation::Off>;

  Executor::run(Assign(dst, expr), DefaultDevice());
}

// Assignment with specified device and tiling strategy.
template <bool Vectorizable, TiledEvaluation Tiling, typename Device,
          typename Dst, typename Expr>
static void DeviceAssign(Device& d, Dst& dst, Expr expr) {
  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
  using Executor = Eigen::internal::TensorExecutor<const Assign, Device,
                                                   Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);
}
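
// Illustrative sketch only (not wired into any subtest below): the tests in
// this file typically compute a "golden" result with DefaultAssign and the
// result under test with DeviceAssign, then compare the two coefficient by
// coefficient. The rank, sizes and the square() op here are arbitrary choices
// made for this example.
template <bool Vectorizable, TiledEvaluation Tiling, typename Device>
static void example_compare_against_default(Device& d) {
  Tensor<float, 2> src(16, 16);
  src.setRandom();

  const auto expr = src.square();

  // Reference result, computed with plain coefficient-wise evaluation.
  Tensor<float, 2> golden(16, 16);
  DefaultAssign(golden, expr);

  // Result computed with the executor configuration under test.
  Tensor<float, 2> dst(16, 16);
  DeviceAssign<Vectorizable, Tiling>(d, dst, expr);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}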

template <int NumDims>
static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
  array<Index, NumDims> dims;
  for (int i = 0; i < NumDims; ++i) {
    dims[i] = internal::random<int>(min_dim, max_dim);
  }
  return dims;
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_unary_expr(Device d)
{
  static constexpr int Options = 0 | Layout;

  // Pick a large enough tensor size to bypass small tensor block evaluation
  // optimization.
  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);

  Tensor<T, NumDims, Options, Index> src(dims);
  Tensor<T, NumDims, Options, Index> dst(dims);

  src.setRandom();
  const auto expr = src.square();

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    T square = src.coeff(i) * src.coeff(i);
    VERIFY_IS_EQUAL(square, dst.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_binary_expr(Device d)
{
  static constexpr int Options = 0 | Layout;

  // Pick a large enough tensor size to bypass small tensor block evaluation
  // optimization.
  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);

  Tensor<T, NumDims, Options, Index> lhs(dims);
  Tensor<T, NumDims, Options, Index> rhs(dims);
  Tensor<T, NumDims, Options, Index> dst(dims);

  lhs.setRandom();
  rhs.setRandom();

  const auto expr = lhs + rhs;

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    T sum = lhs.coeff(i) + rhs.coeff(i);
    VERIFY_IS_EQUAL(sum, dst.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_broadcasting(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(1, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  const auto broadcasts = RandomDims<NumDims>(1, 7);
  const auto expr = src.broadcast(broadcasts);

  // We assume that broadcasting on a default device is tested and correct, so
  // we can rely on it to verify correctness of tensor executor and tiling.
  Tensor<T, NumDims, Options, Index> golden;
  golden = expr;

  // Now do the broadcasting using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_chipping_rvalue(Device d)
{
  auto dims = RandomDims<NumDims>(1, 10);
  Tensor<T, NumDims, Layout, Index> src(dims);
  src.setRandom();

#define TEST_CHIPPING(CHIP_DIM)                                           \
  if (NumDims > (CHIP_DIM)) {                                             \
    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
    const auto expr = src.template chip<(CHIP_DIM)>(offset);              \
                                                                          \
    Tensor<T, NumDims - 1, Layout, Index> golden;                         \
    golden = expr;                                                        \
                                                                          \
    Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions());       \
                                                                          \
    using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;   \
    using Executor = internal::TensorExecutor<const Assign, Device,       \
                                              Vectorizable, Tiling>;      \
                                                                          \
    Executor::run(Assign(dst, expr), d);                                  \
                                                                          \
    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {            \
      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                     \
    }                                                                     \
  }

  TEST_CHIPPING(0)
  TEST_CHIPPING(1)
  TEST_CHIPPING(2)
  TEST_CHIPPING(3)
  TEST_CHIPPING(4)
  TEST_CHIPPING(5)

#undef TEST_CHIPPING
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_chipping_lvalue(Device d)
{
  auto dims = RandomDims<NumDims>(1, 10);

#define TEST_CHIPPING(CHIP_DIM)                                            \
  if (NumDims > (CHIP_DIM)) {                                              \
    /* Generate random data that we'll assign to the chipped tensor dim. */\
    array<Index, NumDims - 1> src_dims;                                    \
    for (int i = 0; i < NumDims - 1; ++i) {                                \
      int dim = i < (CHIP_DIM) ? i : i + 1;                                \
      src_dims[i] = dims[dim];                                             \
    }                                                                      \
                                                                           \
    Tensor<T, NumDims - 1, Layout, Index> src(src_dims);                   \
    src.setRandom();                                                       \
                                                                           \
    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1);  \
                                                                           \
    Tensor<T, NumDims, Layout, Index> random(dims);                        \
    random.setZero();                                                      \
                                                                           \
    Tensor<T, NumDims, Layout, Index> golden(dims);                        \
    golden = random;                                                       \
    golden.template chip<(CHIP_DIM)>(offset) = src;                        \
                                                                           \
    Tensor<T, NumDims, Layout, Index> dst(dims);                           \
    dst = random;                                                          \
    auto expr = dst.template chip<(CHIP_DIM)>(offset);                     \
                                                                           \
    using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;    \
    using Executor = internal::TensorExecutor<const Assign, Device,        \
                                              Vectorizable, Tiling>;       \
                                                                           \
    Executor::run(Assign(expr, src), d);                                   \
                                                                           \
    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {             \
      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                      \
    }                                                                      \
  }

  TEST_CHIPPING(0)
  TEST_CHIPPING(1)
  TEST_CHIPPING(2)
  TEST_CHIPPING(3)
  TEST_CHIPPING(4)
  TEST_CHIPPING(5)

#undef TEST_CHIPPING
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_shuffle_rvalue(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(1, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  DSizes<Index, NumDims> shuffle;
  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;

  // Test all possible shuffle permutations.
  do {
    DSizes<Index, NumDims> shuffled_dims;
    for (int i = 0; i < NumDims; ++i) {
      shuffled_dims[i] = dims[shuffle[i]];
    }

    const auto expr = src.shuffle(shuffle);

    // We assume that shuffling on a default device is tested and correct, so
    // we can rely on it to verify correctness of tensor executor and tiling.
    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
    DefaultAssign(golden, expr);

    // Now do the shuffling using configured tensor executor.
    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
    DeviceAssign<Vectorizable, Tiling>(d, dst, expr);

    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    }

  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_shuffle_lvalue(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  DSizes<Index, NumDims> shuffle;
  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;

  // Test all possible shuffle permutations.
  do {
    DSizes<Index, NumDims> shuffled_dims;
    for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];

    // We assume that shuffling on a default device is tested and correct, so
    // we can rely on it to verify correctness of tensor executor and tiling.
    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
    auto golden_shuffle = golden.shuffle(shuffle);
    DefaultAssign(golden_shuffle, src);

    // Now do the shuffling using configured tensor executor.
    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
    auto dst_shuffle = dst.shuffle(shuffle);
    DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src);

    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    }

  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_reshape(Device d)
{
  static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");

  static constexpr int ReshapedDims = NumDims - 1;
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Merge the first two dimensions into one (multiply their sizes) and then
  // shuffle the reshaped dimensions into a random order.
  std::vector<Index> shuffle;
  for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());

  DSizes<Index, ReshapedDims> reshaped_dims;
  reshaped_dims[shuffle[0]] = dims[0] * dims[1];
  for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];

  Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);

  // Now reshape using configured tensor executor.
  Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions());

  auto expr = src.reshape(reshaped_dims);

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_slice_rvalue(Device d)
{
  static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Pick a random slice of src tensor.
  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>());
  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>());

  // Make sure that slice start + size do not overflow tensor dims.
  for (int i = 0; i < NumDims; ++i) {
    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
  }

  Tensor<T, NumDims, Options, Index> golden =
      src.slice(slice_start, slice_size);

  // Now take the slice using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  auto expr = src.slice(slice_start, slice_size);

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_slice_lvalue(Device d)
{
  static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Pick a random slice of src tensor.
  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));

  // Make sure that slice start + size do not overflow tensor dims.
  for (int i = 0; i < NumDims; ++i) {
    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
  }

  Tensor<T, NumDims, Options, Index> slice(slice_size);
  slice.setRandom();

  // Assign a slice using default executor.
  Tensor<T, NumDims, Options, Index> golden = src;
  golden.slice(slice_start, slice_size) = slice;

  // And using configured execution strategy.
  Tensor<T, NumDims, Options, Index> dst = src;
  auto expr = dst.slice(slice_start, slice_size);

  using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(expr, slice), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_broadcasting_of_forced_eval(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(1, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  const auto broadcasts = RandomDims<NumDims>(1, 7);
  const auto expr = src.square().eval().broadcast(broadcasts);

  // We assume that broadcasting on a default device is tested and correct, so
  // we can rely on it to verify correctness of tensor executor and tiling.
  Tensor<T, NumDims, Options, Index> golden;
  golden = expr;

  // Now do the broadcasting using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

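// Simple generator used by test_execute_generator_op below: for the
// coefficient at coordinates (c_0, ..., c_{N-1}) it returns
// sum_i (i + 1) * c_i. (The per-coefficient coordinates are what the
// generator receives through its `dims` argument.)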
template<typename T, int NumDims>
struct DummyGenerator {
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  T operator()(const array<Index, NumDims>& dims) const {
    T result = static_cast<T>(0);
    for (int i = 0; i < NumDims; ++i) {
      result += static_cast<T>((i + 1) * dims[i]);
    }
    return result;
  }
};

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_generator_op(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(20, 30);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  const auto expr = src.generate(DummyGenerator<T, NumDims>());

  // We assume that the generator op on a default device is tested and correct,
  // so we can rely on it to verify correctness of tensor executor and tiling.
  Tensor<T, NumDims, Options, Index> golden;
  golden = expr;

  // Now do the generation using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_reverse_rvalue(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Reverse a random subset of the dimensions.
  Eigen::array<bool, NumDims> reverse;
  for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();

  const auto expr = src.reverse(reverse);

  // We assume that reversing on a default device is tested and correct, so
  // we can rely on it to verify correctness of tensor executor and tiling.
  Tensor<T, NumDims, Options, Index> golden;
  golden = expr;

  // Now do the reversing using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_async_execute_unary_expr(Device d)
{
  static constexpr int Options = 0 | Layout;

  // Pick a large enough tensor size to bypass small tensor block evaluation
  // optimization.
  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);

  Tensor<T, NumDims, Options, Index> src(dims);
  Tensor<T, NumDims, Options, Index> dst(dims);

  src.setRandom();
  const auto expr = src.square();

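  // The executor completes asynchronously: block on a barrier that the
  // done-callback notifies, so that `dst` is fully written before we check it.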
  Eigen::Barrier done(1);
  auto on_done = [&done]() { done.Notify(); };

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using DoneCallback = decltype(on_done);
  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
                                                 Vectorizable, Tiling>;

  Executor::runAsync(Assign(dst, expr), d, on_done);
  done.Wait();

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    T square = src.coeff(i) * src.coeff(i);
    VERIFY_IS_EQUAL(square, dst.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_async_execute_binary_expr(Device d)
{
  static constexpr int Options = 0 | Layout;

  // Pick a large enough tensor size to bypass small tensor block evaluation
  // optimization.
  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);

  Tensor<T, NumDims, Options, Index> lhs(dims);
  Tensor<T, NumDims, Options, Index> rhs(dims);
  Tensor<T, NumDims, Options, Index> dst(dims);

  lhs.setRandom();
  rhs.setRandom();

  const auto expr = lhs + rhs;

  Eigen::Barrier done(1);
  auto on_done = [&done]() { done.Notify(); };

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using DoneCallback = decltype(on_done);
  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
                                                 Vectorizable, Tiling>;

  Executor::runAsync(Assign(dst, expr), d, on_done);
  done.Wait();

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    T sum = lhs.coeff(i) + rhs.coeff(i);
    VERIFY_IS_EQUAL(sum, dst.coeff(i));
  }
}

#ifdef EIGEN_DONT_VECTORIZE
#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
#else
#define VECTORIZABLE(VAL) VAL
#endif
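
// Note: when Eigen is built without vectorization, VECTORIZABLE(true) is
// intended to evaluate to false, so the "vectorized" instantiations below
// still run, just with vectorization disabled rather than being skipped.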

#define CALL_SUBTEST_PART(PART) \
  CALL_SUBTEST_##PART

#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))

// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))
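
// For reference, each line of the combination macros expands to a plain
// subtest call. For example, the first line of
//   CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3)
// becomes (roughly):
//   CALL_SUBTEST_1((test_execute_unary_expr<float, 3, DefaultDevice, false,
//                                           TiledEvaluation::Off, ColMajor>(
//       default_device)));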

EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::DefaultDevice default_device;
  // Default device is unused in ASYNC tests.
  EIGEN_UNUSED_VARIABLE(default_device);

  const auto num_threads = internal::random<int>(20, 24);
  Eigen::ThreadPool tp(num_threads);
  Eigen::ThreadPoolDevice tp_device(&tp, num_threads);

  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);

  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);

  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);

  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);

  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);

  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);

  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);

  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5);

  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3);
  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4);
  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5);

  // Force CMake to split this test.
  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
}