// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_USE_THREADS

#include "main.h"

#include <Eigen/CXX11/Tensor>

using Eigen::Tensor;
using Eigen::RowMajor;
using Eigen::ColMajor;
using Eigen::internal::TiledEvaluation;

// A set of tests to verify that different TensorExecutor strategies yield the
// same results for all the ops that support tiled evaluation.

// Default assignment that does not use block evaluation or vectorization.
// We assume that default coefficient evaluation is well tested and correct.
template <typename Dst, typename Expr>
static void DefaultAssign(Dst& dst, Expr expr) {
  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
  using Executor =
      Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
                                      /*Vectorizable=*/false,
                                      /*Tiling=*/TiledEvaluation::Off>;

  Executor::run(Assign(dst, expr), DefaultDevice());
}

// Assignment with specified device and tiling strategy.
template <bool Vectorizable, TiledEvaluation Tiling, typename Device,
          typename Dst, typename Expr>
static void DeviceAssign(Device& d, Dst& dst, Expr expr) {
  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
  using Executor = Eigen::internal::TensorExecutor<const Assign, Device,
                                                   Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);
}
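
// Illustrative sketch (not part of the test suite) of how the two helpers
// above are meant to be used: evaluate the same expression with the default
// executor and with an explicitly configured one, then compare the results.
// Here `device` stands for any Eigen device instance, e.g. a ThreadPoolDevice.
//
//   Tensor<float, 2> src(32, 32), golden(32, 32), dst(32, 32);
//   src.setRandom();
//   DefaultAssign(golden, src.square());
//   DeviceAssign</*Vectorizable=*/false, TiledEvaluation::On>(
//       device, dst, src.square());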

template <int NumDims>
static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
  array<Index, NumDims> dims;
  for (int i = 0; i < NumDims; ++i) {
    dims[i] = internal::random<int>(min_dim, max_dim);
  }
  return dims;
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_unary_expr(Device d)
{
  static constexpr int Options = 0 | Layout;

  // Pick a large enough tensor size to bypass small tensor block evaluation
  // optimization.
  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);

  Tensor<T, NumDims, Options, Index> src(dims);
  Tensor<T, NumDims, Options, Index> dst(dims);

  src.setRandom();
  const auto expr = src.square();

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    T square = src.coeff(i) * src.coeff(i);
    VERIFY_IS_EQUAL(square, dst.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_binary_expr(Device d)
{
  static constexpr int Options = 0 | Layout;

  // Pick a large enough tensor size to bypass small tensor block evaluation
  // optimization.
  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);

  Tensor<T, NumDims, Options, Index> lhs(dims);
  Tensor<T, NumDims, Options, Index> rhs(dims);
  Tensor<T, NumDims, Options, Index> dst(dims);

  lhs.setRandom();
  rhs.setRandom();

  const auto expr = lhs + rhs;

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    T sum = lhs.coeff(i) + rhs.coeff(i);
    VERIFY_IS_EQUAL(sum, dst.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_broadcasting(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(1, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  const auto broadcasts = RandomDims<NumDims>(1, 7);
  const auto expr = src.broadcast(broadcasts);

  // We assume that broadcasting on a default device is tested and correct, so
  // we can rely on it to verify correctness of tensor executor and tiling.
  Tensor<T, NumDims, Options, Index> golden;
  golden = expr;

  // Now do the broadcasting using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_chipping_rvalue(Device d)
{
  auto dims = RandomDims<NumDims>(1, 10);
  Tensor<T, NumDims, Layout, Index> src(dims);
  src.setRandom();

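// The chip dimension must be a compile-time template argument, so a macro is
// used to stamp out one test body per candidate dimension; the run-time
// `NumDims > CHIP_DIM` guard skips dimensions that do not exist.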
#define TEST_CHIPPING(CHIP_DIM)                                           \
  if (NumDims > (CHIP_DIM)) {                                             \
    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
    const auto expr = src.template chip<(CHIP_DIM)>(offset);              \
                                                                          \
    Tensor<T, NumDims - 1, Layout, Index> golden;                         \
    golden = expr;                                                        \
                                                                          \
    Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions());       \
                                                                          \
    using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;   \
    using Executor = internal::TensorExecutor<const Assign, Device,       \
                                              Vectorizable, Tiling>;      \
                                                                          \
    Executor::run(Assign(dst, expr), d);                                  \
                                                                          \
    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {            \
      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                     \
    }                                                                     \
  }

  TEST_CHIPPING(0)
  TEST_CHIPPING(1)
  TEST_CHIPPING(2)
  TEST_CHIPPING(3)
  TEST_CHIPPING(4)
  TEST_CHIPPING(5)

#undef TEST_CHIPPING
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_chipping_lvalue(Device d)
{
  auto dims = RandomDims<NumDims>(1, 10);

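// As in the rvalue test above, the chip dimension must be known at compile
// time, so each candidate dimension gets its own macro-expanded test body.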
#define TEST_CHIPPING(CHIP_DIM)                                             \
  if (NumDims > (CHIP_DIM)) {                                               \
    /* Generate random data that we'll assign to the chipped tensor dim. */ \
    array<Index, NumDims - 1> src_dims;                                     \
    for (int i = 0; i < NumDims - 1; ++i) {                                 \
      int dim = i < (CHIP_DIM) ? i : i + 1;                                 \
      src_dims[i] = dims[dim];                                              \
    }                                                                       \
                                                                            \
    Tensor<T, NumDims - 1, Layout, Index> src(src_dims);                    \
    src.setRandom();                                                        \
                                                                            \
    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1);   \
                                                                            \
    Tensor<T, NumDims, Layout, Index> random(dims);                         \
    random.setZero();                                                       \
                                                                            \
    Tensor<T, NumDims, Layout, Index> golden(dims);                         \
    golden = random;                                                        \
    golden.template chip<(CHIP_DIM)>(offset) = src;                         \
                                                                            \
    Tensor<T, NumDims, Layout, Index> dst(dims);                            \
    dst = random;                                                           \
    auto expr = dst.template chip<(CHIP_DIM)>(offset);                      \
                                                                            \
    using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;     \
    using Executor = internal::TensorExecutor<const Assign, Device,         \
                                              Vectorizable, Tiling>;        \
                                                                            \
    Executor::run(Assign(expr, src), d);                                    \
                                                                            \
    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {              \
      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                       \
    }                                                                       \
  }

  TEST_CHIPPING(0)
  TEST_CHIPPING(1)
  TEST_CHIPPING(2)
  TEST_CHIPPING(3)
  TEST_CHIPPING(4)
  TEST_CHIPPING(5)

#undef TEST_CHIPPING
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_shuffle_rvalue(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(1, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  DSizes<Index, NumDims> shuffle;
  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;

  // Test all possible shuffle permutations.
  do {
    DSizes<Index, NumDims> shuffled_dims;
    for (int i = 0; i < NumDims; ++i) {
      shuffled_dims[i] = dims[shuffle[i]];
    }

    const auto expr = src.shuffle(shuffle);

    // We assume that shuffling on a default device is tested and correct, so
    // we can rely on it to verify correctness of tensor executor and tiling.
    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
    DefaultAssign(golden, expr);

    // Now do the shuffling using configured tensor executor.
    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
    DeviceAssign<Vectorizable, Tiling>(d, dst, expr);

    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    }

  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_shuffle_lvalue(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  DSizes<Index, NumDims> shuffle;
  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;

  // Test all possible shuffle permutations.
  do {
    DSizes<Index, NumDims> shuffled_dims;
    for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];

    // We assume that shuffling on a default device is tested and correct, so
    // we can rely on it to verify correctness of tensor executor and tiling.
    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
    auto golden_shuffle = golden.shuffle(shuffle);
    DefaultAssign(golden_shuffle, src);

    // Now do the shuffling using configured tensor executor.
    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
    auto dst_shuffle = dst.shuffle(shuffle);
    DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src);

    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    }

  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_reshape(Device d)
{
  static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");

  static constexpr int ReshapedDims = NumDims - 1;
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Collapse the first two dimensions into one, and place the reshaped
  // dimensions at randomly shuffled positions.
  std::vector<Index> shuffle;
  for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
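  // Note: a default-constructed std::mt19937 always starts from the same
  // fixed seed, so this permutation is deterministic across test runs.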

  DSizes<Index, ReshapedDims> reshaped_dims;
  reshaped_dims[shuffle[0]] = dims[0] * dims[1];
  for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];

  Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);

  // Now reshape using configured tensor executor.
  Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions());

  auto expr = src.reshape(reshaped_dims);

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_slice_rvalue(Device d)
{
  static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Pick a random slice of src tensor.
  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>());
  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>());

  // Make sure that slice start + size do not overflow tensor dims.
  for (int i = 0; i < NumDims; ++i) {
    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
  }

  Tensor<T, NumDims, Options, Index> golden =
      src.slice(slice_start, slice_size);

  // Now do the slicing using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  auto expr = src.slice(slice_start, slice_size);

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_slice_lvalue(Device d)
{
  static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Pick a random slice of src tensor.
  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));

  // Make sure that slice start + size do not overflow tensor dims.
  for (int i = 0; i < NumDims; ++i) {
    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
  }

  Tensor<T, NumDims, Options, Index> slice(slice_size);
  slice.setRandom();

  // Assign a slice using default executor.
  Tensor<T, NumDims, Options, Index> golden = src;
  golden.slice(slice_start, slice_size) = slice;

  // And using configured execution strategy.
  Tensor<T, NumDims, Options, Index> dst = src;
  auto expr = dst.slice(slice_start, slice_size);

  using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(expr, slice), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_broadcasting_of_forced_eval(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(1, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  const auto broadcasts = RandomDims<NumDims>(1, 7);
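  // The inner `.eval()` forces `src.square()` to be materialized into a
  // temporary before broadcasting, exercising the forced-eval code path.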
  const auto expr = src.square().eval().broadcast(broadcasts);

  // We assume that broadcasting on a default device is tested and correct, so
  // we can rely on it to verify correctness of tensor executor and tiling.
  Tensor<T, NumDims, Options, Index> golden;
  golden = expr;

  // Now do the broadcasting using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

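// Deterministic generator: maps each coordinate to a linear function of its
// indices, so the expected value at any position is easy to recompute.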
template<typename T, int NumDims>
struct DummyGenerator {
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  T operator()(const array<Index, NumDims>& dims) const {
    T result = static_cast<T>(0);
    for (int i = 0; i < NumDims; ++i) {
      result += static_cast<T>((i + 1) * dims[i]);
    }
    return result;
  }
};

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_generator_op(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(20, 30);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  const auto expr = src.generate(DummyGenerator<T, NumDims>());

  // We assume that generator on a default device is tested and correct, so
  // we can rely on it to verify correctness of tensor executor and tiling.
  Tensor<T, NumDims, Options, Index> golden;
  golden = expr;

  // Now run the generator op using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_execute_reverse_rvalue(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Reverse each dimension with 50% probability, i.e. roughly half of the
  // dimensions on average.
  Eigen::array<bool, NumDims> reverse;
  for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();

  const auto expr = src.reverse(reverse);

  // We assume that reversing on a default device is tested and correct, so
  // we can rely on it to verify correctness of tensor executor and tiling.
  Tensor<T, NumDims, Options, Index> golden;
  golden = expr;

  // Now do the reversing using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_async_execute_unary_expr(Device d)
{
  static constexpr int Options = 0 | Layout;

  // Pick a large enough tensor size to bypass small tensor block evaluation
  // optimization.
  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);

  Tensor<T, NumDims, Options, Index> src(dims);
  Tensor<T, NumDims, Options, Index> dst(dims);

  src.setRandom();
  const auto expr = src.square();

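  // The async executor signals completion via a callback; block on a barrier
  // until the evaluation has finished before verifying the result.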
  Eigen::Barrier done(1);
  auto on_done = [&done]() { done.Notify(); };

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using DoneCallback = decltype(on_done);
  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
                                                 Vectorizable, Tiling>;

  Executor::runAsync(Assign(dst, expr), d, on_done);
  done.Wait();

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    T square = src.coeff(i) * src.coeff(i);
    VERIFY_IS_EQUAL(square, dst.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          TiledEvaluation Tiling, int Layout>
static void test_async_execute_binary_expr(Device d)
{
  static constexpr int Options = 0 | Layout;

  // Pick a large enough tensor size to bypass small tensor block evaluation
  // optimization.
  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);

  Tensor<T, NumDims, Options, Index> lhs(dims);
  Tensor<T, NumDims, Options, Index> rhs(dims);
  Tensor<T, NumDims, Options, Index> dst(dims);

  lhs.setRandom();
  rhs.setRandom();

  const auto expr = lhs + rhs;

  Eigen::Barrier done(1);
  auto on_done = [&done]() { done.Notify(); };

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using DoneCallback = decltype(on_done);
  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
                                                 Vectorizable, Tiling>;

  Executor::runAsync(Assign(dst, expr), d, on_done);
  done.Wait();

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    T sum = lhs.coeff(i) + rhs.coeff(i);
    VERIFY_IS_EQUAL(sum, dst.coeff(i));
  }
}

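// If Eigen was built with vectorization disabled, downgrade the "vectorized"
// test configurations to scalar ones instead of skipping them entirely.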
#ifdef EIGEN_DONT_VECTORIZE
#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
#else
#define VECTORIZABLE(VAL) VAL
#endif

#define CALL_SUBTEST_PART(PART) \
  CALL_SUBTEST_##PART

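// Instantiate NAME for every combination of device (default / thread pool),
// vectorization (on / off), tiling strategy (on / off), and layout
// (ColMajor / RowMajor): 16 configurations in total.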
#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                             \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off, ColMajor>(tp_device)));      \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device)));      \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off, ColMajor>(tp_device)));      \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device)));      \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off, RowMajor>(tp_device)));      \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device)));      \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off, RowMajor>(tp_device)));      \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device)))

// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                  \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device)))

EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::DefaultDevice default_device;
  // Default device is unused in ASYNC tests.
  EIGEN_UNUSED_VARIABLE(default_device);

  const auto num_threads = internal::random<int>(20, 24);
  Eigen::ThreadPool tp(num_threads);
  Eigen::ThreadPoolDevice tp_device(&tp, num_threads);

  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);

  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);

  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);

  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);

  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);

  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);

  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);

  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5);

  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3);
  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4);
  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5);

  // Force CMake to split this test.
  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
}