// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_USE_THREADS

#include "main.h"

#include <cmath>
#include <iostream>

#include <Eigen/CXX11/Tensor>

using Eigen::Tensor;
19 class TestAllocator : public Allocator {
20  public:
~TestAllocator()21   ~TestAllocator() EIGEN_OVERRIDE {}
allocate(size_t num_bytes) const22   EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE {
23     const_cast<TestAllocator*>(this)->alloc_count_++;
24     return internal::aligned_malloc(num_bytes);
25   }
deallocate(void * buffer) const26   EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE {
27     const_cast<TestAllocator*>(this)->dealloc_count_++;
28     internal::aligned_free(buffer);
29   }
30 
alloc_count() const31   int alloc_count() const { return alloc_count_; }
dealloc_count() const32   int dealloc_count() const { return dealloc_count_; }
33 
34  private:
35   int alloc_count_ = 0;
36   int dealloc_count_ = 0;
37 };
38 
test_multithread_elementwise()39 void test_multithread_elementwise()
40 {
41   Tensor<float, 3> in1(200, 30, 70);
42   Tensor<float, 3> in2(200, 30, 70);
43   Tensor<double, 3> out(200, 30, 70);
44 
45   in1.setRandom();
46   in2.setRandom();
47 
48   Eigen::ThreadPool tp(internal::random<int>(3, 11));
49   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
50   out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>();
51 
52   for (int i = 0; i < 200; ++i) {
53     for (int j = 0; j < 30; ++j) {
54       for (int k = 0; k < 70; ++k) {
55         VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
56       }
57     }
58   }
59 }
60 
test_async_multithread_elementwise()61 void test_async_multithread_elementwise()
62 {
63   Tensor<float, 3> in1(200, 30, 70);
64   Tensor<float, 3> in2(200, 30, 70);
65   Tensor<double, 3> out(200, 30, 70);
66 
67   in1.setRandom();
68   in2.setRandom();
69 
70   Eigen::ThreadPool tp(internal::random<int>(3, 11));
71   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
72 
73   Eigen::Barrier b(1);
74   out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>();
75   b.Wait();
76 
77   for (int i = 0; i < 200; ++i) {
78     for (int j = 0; j < 30; ++j) {
79       for (int k = 0; k < 70; ++k) {
80         VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
81       }
82     }
83   }
84 }
85 
test_multithread_compound_assignment()86 void test_multithread_compound_assignment()
87 {
88   Tensor<float, 3> in1(2,3,7);
89   Tensor<float, 3> in2(2,3,7);
90   Tensor<float, 3> out(2,3,7);
91 
92   in1.setRandom();
93   in2.setRandom();
94 
95   Eigen::ThreadPool tp(internal::random<int>(3, 11));
96   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
97   out.device(thread_pool_device) = in1;
98   out.device(thread_pool_device) += in2 * 3.14f;
99 
100   for (int i = 0; i < 2; ++i) {
101     for (int j = 0; j < 3; ++j) {
102       for (int k = 0; k < 7; ++k) {
103         VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
104       }
105     }
106   }
107 }
108 
109 template<int DataLayout>
test_multithread_contraction()110 void test_multithread_contraction()
111 {
112   Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31);
113   Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10);
114   Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10);
115 
116   t_left.setRandom();
117   t_right.setRandom();
118 
119   // this contraction should be equivalent to a single matrix multiplication
120   typedef Tensor<float, 1>::DimensionPair DimPair;
121   Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
122 
123   typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
124   MapXf m_left(t_left.data(), 1500, 1147);
125   MapXf m_right(t_right.data(), 1147, 1400);
126   Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
127 
128   Eigen::ThreadPool tp(4);
129   Eigen::ThreadPoolDevice thread_pool_device(&tp, 4);
130 
131   // compute results by separate methods
132   t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
133   m_result = m_left * m_right;
134 
135  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
136     VERIFY(&t_result.data()[i] != &m_result.data()[i]);
137     if (fabsf(t_result(i) - m_result(i)) < 1e-4f) {
138       continue;
139     }
140     if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) {
141       continue;
142     }
143     std::cout << "mismatch detected at index " << i << ": " << t_result(i)
144               << " vs " <<  m_result(i) << std::endl;
145     assert(false);
146   }
147 }
148 
149 template<int DataLayout>
test_contraction_corner_cases()150 void test_contraction_corner_cases()
151 {
152   Tensor<float, 2, DataLayout> t_left(32, 500);
153   Tensor<float, 2, DataLayout> t_right(32, 28*28);
154   Tensor<float, 2, DataLayout> t_result(500, 28*28);
155 
156   t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
157   t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
158   t_result = t_result.constant(NAN);
159 
160   // this contraction should be equivalent to a single matrix multiplication
161   typedef Tensor<float, 1>::DimensionPair DimPair;
162   Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}};
163 
164   typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
165   MapXf m_left(t_left.data(), 32, 500);
166   MapXf m_right(t_right.data(), 32, 28*28);
167   Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28);
168 
169   Eigen::ThreadPool tp(12);
170   Eigen::ThreadPoolDevice thread_pool_device(&tp, 12);
171 
172   // compute results by separate methods
173   t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
174   m_result = m_left.transpose() * m_right;
175 
176   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
177     assert(!(numext::isnan)(t_result.data()[i]));
178     if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
179       std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
180       assert(false);
181     }
182   }
183 
184   t_left.resize(32, 1);
185   t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
186   t_result.resize (1, 28*28);
187   t_result = t_result.constant(NAN);
188   t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
189   new(&m_left) MapXf(t_left.data(), 32, 1);
190   m_result = m_left.transpose() * m_right;
191   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
192     assert(!(numext::isnan)(t_result.data()[i]));
193     if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
194       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
195       assert(false);
196     }
197   }
198 
199   t_left.resize(32, 500);
200   t_right.resize(32, 4);
201   t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
202   t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
203   t_result.resize (500, 4);
204   t_result = t_result.constant(NAN);
205   t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
206   new(&m_left) MapXf(t_left.data(), 32, 500);
207   new(&m_right) MapXf(t_right.data(), 32, 4);
208   m_result = m_left.transpose() * m_right;
209   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
210     assert(!(numext::isnan)(t_result.data()[i]));
211     if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
212       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
213       assert(false);
214     }
215   }
216 
217   t_left.resize(32, 1);
218   t_right.resize(32, 4);
219   t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
220   t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
221   t_result.resize (1, 4);
222   t_result = t_result.constant(NAN);
223   t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
224   new(&m_left) MapXf(t_left.data(), 32, 1);
225   new(&m_right) MapXf(t_right.data(), 32, 4);
226   m_result = m_left.transpose() * m_right;
227   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
228     assert(!(numext::isnan)(t_result.data()[i]));
229     if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
230       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
231       assert(false);
232     }
233   }
234 }
235 
236 template<int DataLayout>
test_multithread_contraction_agrees_with_singlethread()237 void test_multithread_contraction_agrees_with_singlethread() {
238   int contract_size = internal::random<int>(1, 5000);
239 
240   Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80),
241                                     contract_size,
242                                     internal::random<int>(1, 100));
243 
244   Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25),
245                                      internal::random<int>(1, 37),
246                                      contract_size,
247                                      internal::random<int>(1, 51));
248 
249   left.setRandom();
250   right.setRandom();
251 
252   // add constants to shift values away from 0 for more precision
253   left += left.constant(1.5f);
254   right += right.constant(1.5f);
255 
256   typedef Tensor<float, 1>::DimensionPair DimPair;
257   Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
258 
259   Eigen::ThreadPool tp(internal::random<int>(2, 11));
260   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
261 
262   Tensor<float, 5, DataLayout> st_result;
263   st_result = left.contract(right, dims);
264 
265   Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
266   tp_result.device(thread_pool_device) = left.contract(right, dims);
267 
268   VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
269   for (ptrdiff_t i = 0; i < st_result.size(); i++) {
270     // if both of the values are very small, then do nothing (because the test will fail
271     // due to numerical precision issues when values are small)
272     if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
273       VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
274     }
275   }
276 }
277 
278 // Apply Sqrt to all output elements.
279 struct SqrtOutputKernel {
280   template <typename Index, typename Scalar>
operator ()SqrtOutputKernel281   EIGEN_ALWAYS_INLINE void operator()(
282       const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
283       const TensorContractionParams&, Index, Index, Index num_rows,
284       Index num_cols) const {
285     for (int i = 0; i < num_rows; ++i) {
286       for (int j = 0; j < num_cols; ++j) {
287         output_mapper(i, j) = std::sqrt(output_mapper(i, j));
288       }
289     }
290   }
291 };
292 
293 template <int DataLayout>
test_multithread_contraction_with_output_kernel()294 static void test_multithread_contraction_with_output_kernel() {
295   typedef Tensor<float, 1>::DimensionPair DimPair;
296 
297   const int num_threads = internal::random<int>(2, 11);
298   ThreadPool threads(num_threads);
299   Eigen::ThreadPoolDevice device(&threads, num_threads);
300 
301   Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
302   Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
303   Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
304 
305   t_left.setRandom();
306   t_right.setRandom();
307   // Put trash in mat4 to verify contraction clears output memory.
308   t_result.setRandom();
309 
310   // Add a little offset so that the results won't be close to zero.
311   t_left += t_left.constant(1.0f);
312   t_right += t_right.constant(1.0f);
313 
314   typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
315   MapXf m_left(t_left.data(), 1500, 248);
316   MapXf m_right(t_right.data(), 248, 1400);
317   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
318 
319   // this contraction should be equivalent to a single matrix multiplication
320   Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
321 
322   // compute results by separate methods
323   t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
324 
325   m_result = m_left * m_right;
326 
327   for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
328     VERIFY(&t_result.data()[i] != &m_result.data()[i]);
329     VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
330   }
331 }
332 
333 template<int DataLayout>
test_async_multithread_contraction_agrees_with_singlethread()334 void test_async_multithread_contraction_agrees_with_singlethread()
335 {
336   int contract_size = internal::random<int>(100, 500);
337 
338   Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40),
339                                     contract_size,
340                                     internal::random<int>(10, 40));
341 
342   Tensor<float, 4, DataLayout> right(
343       internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size,
344       internal::random<int>(1, 20));
345 
346   left.setRandom();
347   right.setRandom();
348 
349   // add constants to shift values away from 0 for more precision
350   left += left.constant(1.5f);
351   right += right.constant(1.5f);
352 
353   typedef Tensor<float, 1>::DimensionPair DimPair;
354   Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
355 
356   Eigen::ThreadPool tp(internal::random<int>(2, 11));
357   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32));
358 
359   Tensor<float, 5, DataLayout> st_result;
360   st_result = left.contract(right, dims);
361 
362   Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
363 
364   Eigen::Barrier barrier(1);
365   tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) =
366       left.contract(right, dims);
367   barrier.Wait();
368 
369   VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
370   for (ptrdiff_t i = 0; i < st_result.size(); i++) {
371     // if both of the values are very small, then do nothing (because the test
372     // will fail due to numerical precision issues when values are small)
373     if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
374       VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
375     }
376   }
377 }
378 
379 // We are triggering 'evalShardedByInnerDim' optimization.
380 template <int DataLayout>
test_sharded_by_inner_dim_contraction()381 static void test_sharded_by_inner_dim_contraction()
382 {
383   typedef Tensor<float, 1>::DimensionPair DimPair;
384 
385   const int num_threads = internal::random<int>(4, 16);
386   ThreadPool threads(num_threads);
387   Eigen::ThreadPoolDevice device(&threads, num_threads);
388 
389   Tensor<float, 2, DataLayout> t_left(2, 10000);
390   Tensor<float, 2, DataLayout> t_right(10000, 10);
391   Tensor<float, 2, DataLayout> t_result(2, 10);
392 
393   t_left.setRandom();
394   t_right.setRandom();
395   // Put trash in t_result to verify contraction clears output memory.
396   t_result.setRandom();
397 
398   // Add a little offset so that the results won't be close to zero.
399   t_left += t_left.constant(1.0f);
400   t_right += t_right.constant(1.0f);
401 
402   typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
403   MapXf m_left(t_left.data(), 2, 10000);
404   MapXf m_right(t_right.data(), 10000, 10);
405   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
406 
407   // this contraction should be equivalent to a single matrix multiplication
408   Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
409 
410   // compute results by separate methods
411   t_result.device(device) = t_left.contract(t_right, dims);
412   m_result = m_left * m_right;
413 
414   for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
415     VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
416   }
417 }
418 
419 // We are triggering 'evalShardedByInnerDim' optimization with output kernel.
420 template <int DataLayout>
test_sharded_by_inner_dim_contraction_with_output_kernel()421 static void test_sharded_by_inner_dim_contraction_with_output_kernel()
422 {
423   typedef Tensor<float, 1>::DimensionPair DimPair;
424 
425   const int num_threads = internal::random<int>(4, 16);
426   ThreadPool threads(num_threads);
427   Eigen::ThreadPoolDevice device(&threads, num_threads);
428 
429   Tensor<float, 2, DataLayout> t_left(2, 10000);
430   Tensor<float, 2, DataLayout> t_right(10000, 10);
431   Tensor<float, 2, DataLayout> t_result(2, 10);
432 
433   t_left.setRandom();
434   t_right.setRandom();
435   // Put trash in t_result to verify contraction clears output memory.
436   t_result.setRandom();
437 
438   // Add a little offset so that the results won't be close to zero.
439   t_left += t_left.constant(1.0f);
440   t_right += t_right.constant(1.0f);
441 
442   typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
443   MapXf m_left(t_left.data(), 2, 10000);
444   MapXf m_right(t_right.data(), 10000, 10);
445   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
446 
447   // this contraction should be equivalent to a single matrix multiplication
448   Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
449 
450   // compute results by separate methods
451   t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
452   m_result = m_left * m_right;
453 
454   for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
455     VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
456   }
457 }
458 
459 // We are triggering 'evalShardedByInnerDim' optimization.
460 template <int DataLayout>
test_async_sharded_by_inner_dim_contraction()461 static void test_async_sharded_by_inner_dim_contraction()
462 {
463   typedef Tensor<float, 1>::DimensionPair DimPair;
464 
465   const int num_threads = internal::random<int>(4, 16);
466   ThreadPool threads(num_threads);
467   Eigen::ThreadPoolDevice device(&threads, num_threads);
468 
469   Tensor<float, 2, DataLayout> t_left(2, 10000);
470   Tensor<float, 2, DataLayout> t_right(10000, 10);
471   Tensor<float, 2, DataLayout> t_result(2, 10);
472 
473   t_left.setRandom();
474   t_right.setRandom();
475   // Put trash in t_result to verify contraction clears output memory.
476   t_result.setRandom();
477 
478   // Add a little offset so that the results won't be close to zero.
479   t_left += t_left.constant(1.0f);
480   t_right += t_right.constant(1.0f);
481 
482   typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
483   MapXf m_left(t_left.data(), 2, 10000);
484   MapXf m_right(t_right.data(), 10000, 10);
485   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
486 
487   // this contraction should be equivalent to a single matrix multiplication
488   Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
489 
490   // compute results by separate methods
491   Eigen::Barrier barrier(1);
492   t_result.device(device, [&barrier]() { barrier.Notify(); }) =
493       t_left.contract(t_right, dims);
494   barrier.Wait();
495 
496   m_result = m_left * m_right;
497 
498   for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
499     VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
500   }
501 }
502 
503 // We are triggering 'evalShardedByInnerDim' optimization with output kernel.
504 template <int DataLayout>
test_async_sharded_by_inner_dim_contraction_with_output_kernel()505 static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
506 {
507   typedef Tensor<float, 1>::DimensionPair DimPair;
508 
509   const int num_threads = internal::random<int>(4, 16);
510   ThreadPool threads(num_threads);
511   Eigen::ThreadPoolDevice device(&threads, num_threads);
512 
513   Tensor<float, 2, DataLayout> t_left(2, 10000);
514   Tensor<float, 2, DataLayout> t_right(10000, 10);
515   Tensor<float, 2, DataLayout> t_result(2, 10);
516 
517   t_left.setRandom();
518   t_right.setRandom();
519   // Put trash in t_result to verify contraction clears output memory.
520   t_result.setRandom();
521 
522   // Add a little offset so that the results won't be close to zero.
523   t_left += t_left.constant(1.0f);
524   t_right += t_right.constant(1.0f);
525 
526   typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
527   MapXf m_left(t_left.data(), 2, 10000);
528   MapXf m_right(t_right.data(), 10000, 10);
529   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
530 
531   // this contraction should be equivalent to a single matrix multiplication
532   Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
533 
534   // compute results by separate methods
535   Eigen::Barrier barrier(1);
536   t_result.device(device, [&barrier]() { barrier.Notify(); }) =
537       t_left.contract(t_right, dims, SqrtOutputKernel());
538   barrier.Wait();
539   m_result = m_left * m_right;
540 
541   for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
542     VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
543   }
544 }
545 
546 template<int DataLayout>
test_full_contraction()547 void test_full_contraction() {
548   int contract_size1 = internal::random<int>(1, 500);
549   int contract_size2 = internal::random<int>(1, 500);
550 
551   Tensor<float, 2, DataLayout> left(contract_size1,
552                                     contract_size2);
553   Tensor<float, 2, DataLayout> right(contract_size1,
554                                     contract_size2);
555   left.setRandom();
556   right.setRandom();
557 
558   // add constants to shift values away from 0 for more precision
559   left += left.constant(1.5f);
560   right += right.constant(1.5f);
561 
562   typedef Tensor<float, 2>::DimensionPair DimPair;
563   Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
564 
565   Eigen::ThreadPool tp(internal::random<int>(2, 11));
566   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
567 
568   Tensor<float, 0, DataLayout> st_result;
569   st_result = left.contract(right, dims);
570 
571   Tensor<float, 0, DataLayout> tp_result;
572   tp_result.device(thread_pool_device) = left.contract(right, dims);
573 
574   VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
575   // if both of the values are very small, then do nothing (because the test will fail
576   // due to numerical precision issues when values are small)
577   if (numext::abs(st_result() - tp_result()) >= 1e-4f) {
578     VERIFY_IS_APPROX(st_result(), tp_result());
579   }
580 }
581 
582 template<int DataLayout>
test_multithreaded_reductions()583 void test_multithreaded_reductions() {
584   const int num_threads = internal::random<int>(3, 11);
585   ThreadPool thread_pool(num_threads);
586   Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads);
587 
588   const int num_rows = internal::random<int>(13, 732);
589   const int num_cols = internal::random<int>(13, 732);
590   Tensor<float, 2, DataLayout> t1(num_rows, num_cols);
591   t1.setRandom();
592 
593   Tensor<float, 0, DataLayout> full_redux;
594   full_redux = t1.sum();
595 
596   Tensor<float, 0, DataLayout> full_redux_tp;
597   full_redux_tp.device(thread_pool_device) = t1.sum();
598 
599   // Check that the single threaded and the multi threaded reductions return
600   // the same result.
601   VERIFY_IS_APPROX(full_redux(), full_redux_tp());
602 }
603 
604 
test_memcpy()605 void test_memcpy() {
606 
607   for (int i = 0; i < 5; ++i) {
608     const int num_threads = internal::random<int>(3, 11);
609     Eigen::ThreadPool tp(num_threads);
610     Eigen::ThreadPoolDevice thread_pool_device(&tp, num_threads);
611 
612     const int size = internal::random<int>(13, 7632);
613     Tensor<float, 1> t1(size);
614     t1.setRandom();
615     std::vector<float> result(size);
616     thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float));
617     for (int j = 0; j < size; j++) {
618       VERIFY_IS_EQUAL(t1(j), result[j]);
619     }
620   }
621 }
622 
623 
test_multithread_random()624 void test_multithread_random()
625 {
626   Eigen::ThreadPool tp(2);
627   Eigen::ThreadPoolDevice device(&tp, 2);
628   Tensor<float, 1> t(1 << 20);
629   t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();
630 }
631 
632 template<int DataLayout>
test_multithread_shuffle(Allocator * allocator)633 void test_multithread_shuffle(Allocator* allocator)
634 {
635   Tensor<float, 4, DataLayout> tensor(17,5,7,11);
636   tensor.setRandom();
637 
638   const int num_threads = internal::random<int>(2, 11);
639   ThreadPool threads(num_threads);
640   Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
641 
642   Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
643   array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
644   shuffle.device(device) = tensor.shuffle(shuffles);
645 
646   for (int i = 0; i < 17; ++i) {
647     for (int j = 0; j < 5; ++j) {
648       for (int k = 0; k < 7; ++k) {
649         for (int l = 0; l < 11; ++l) {
650           VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i));
651         }
652       }
653     }
654   }
655 }
656 
test_threadpool_allocate(TestAllocator * allocator)657 void test_threadpool_allocate(TestAllocator* allocator)
658 {
659   const int num_threads = internal::random<int>(2, 11);
660   const int num_allocs = internal::random<int>(2, 11);
661   ThreadPool threads(num_threads);
662   Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
663 
664   for (int a = 0; a < num_allocs; ++a) {
665     void* ptr = device.allocate(512);
666     device.deallocate(ptr);
667   }
668   VERIFY(allocator != NULL);
669   VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
670   VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
671 }
672 
// Test driver. Subtest numbers group related cases so CMake can split the
// binary into smaller independent test runs.
EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
{
  // Elementwise expressions (sync and async) and compound assignment.
  CALL_SUBTEST_1(test_multithread_elementwise());
  CALL_SUBTEST_1(test_async_multithread_elementwise());
  CALL_SUBTEST_1(test_multithread_compound_assignment());

  // Basic multi-threaded contractions.
  CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
  CALL_SUBTEST_2(test_multithread_contraction<RowMajor>());

  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());

  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());

  // Test EvalShardedByInnerDimContext parallelization strategy.
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());

  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());

  // Exercise various cases that have been problematic in the past.
  CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
  CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());

  CALL_SUBTEST_8(test_full_contraction<ColMajor>());
  CALL_SUBTEST_8(test_full_contraction<RowMajor>());

  CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
  CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());

  CALL_SUBTEST_10(test_memcpy());
  CALL_SUBTEST_10(test_multithread_random());

  // Allocator-aware device tests share one counting allocator instance.
  TestAllocator test_allocator;
  CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
  CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
  CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));

  // Force CMake to split this test.
  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
}
722