1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10 #define EIGEN_USE_THREADS
11
12
13 #include "main.h"
14 #include <iostream>
15 #include <Eigen/CXX11/Tensor>
16
17 using Eigen::Tensor;
18
19 class TestAllocator : public Allocator {
20 public:
~TestAllocator()21 ~TestAllocator() EIGEN_OVERRIDE {}
allocate(size_t num_bytes) const22 EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE {
23 const_cast<TestAllocator*>(this)->alloc_count_++;
24 return internal::aligned_malloc(num_bytes);
25 }
deallocate(void * buffer) const26 EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE {
27 const_cast<TestAllocator*>(this)->dealloc_count_++;
28 internal::aligned_free(buffer);
29 }
30
alloc_count() const31 int alloc_count() const { return alloc_count_; }
dealloc_count() const32 int dealloc_count() const { return dealloc_count_; }
33
34 private:
35 int alloc_count_ = 0;
36 int dealloc_count_ = 0;
37 };
38
test_multithread_elementwise()39 void test_multithread_elementwise()
40 {
41 Tensor<float, 3> in1(200, 30, 70);
42 Tensor<float, 3> in2(200, 30, 70);
43 Tensor<double, 3> out(200, 30, 70);
44
45 in1.setRandom();
46 in2.setRandom();
47
48 Eigen::ThreadPool tp(internal::random<int>(3, 11));
49 Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
50 out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>();
51
52 for (int i = 0; i < 200; ++i) {
53 for (int j = 0; j < 30; ++j) {
54 for (int k = 0; k < 70; ++k) {
55 VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
56 }
57 }
58 }
59 }
60
test_async_multithread_elementwise()61 void test_async_multithread_elementwise()
62 {
63 Tensor<float, 3> in1(200, 30, 70);
64 Tensor<float, 3> in2(200, 30, 70);
65 Tensor<double, 3> out(200, 30, 70);
66
67 in1.setRandom();
68 in2.setRandom();
69
70 Eigen::ThreadPool tp(internal::random<int>(3, 11));
71 Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
72
73 Eigen::Barrier b(1);
74 out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>();
75 b.Wait();
76
77 for (int i = 0; i < 200; ++i) {
78 for (int j = 0; j < 30; ++j) {
79 for (int k = 0; k < 70; ++k) {
80 VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
81 }
82 }
83 }
84 }
85
test_multithread_compound_assignment()86 void test_multithread_compound_assignment()
87 {
88 Tensor<float, 3> in1(2,3,7);
89 Tensor<float, 3> in2(2,3,7);
90 Tensor<float, 3> out(2,3,7);
91
92 in1.setRandom();
93 in2.setRandom();
94
95 Eigen::ThreadPool tp(internal::random<int>(3, 11));
96 Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
97 out.device(thread_pool_device) = in1;
98 out.device(thread_pool_device) += in2 * 3.14f;
99
100 for (int i = 0; i < 2; ++i) {
101 for (int j = 0; j < 3; ++j) {
102 for (int k = 0; k < 7; ++k) {
103 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
104 }
105 }
106 }
107 }
108
// Contracts a 4-D tensor with a 5-D tensor over two dimensions on a
// ThreadPoolDevice and checks the result against the equivalent matrix
// product computed (single-threaded) through Map'ed matrices.
template<int DataLayout>
void test_multithread_contraction()
{
  Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31);
  Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10);
  Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10);

  t_left.setRandom();
  t_right.setRandom();

  // this contraction should be equivalent to a single matrix multiplication
  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});

  // Flat matrix views: (30*50)x(37*31) times (37*31)x(70*2*10).
  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 1500, 1147);
  MapXf m_right(t_right.data(), 1147, 1400);
  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);

  Eigen::ThreadPool tp(4);
  Eigen::ThreadPoolDevice thread_pool_device(&tp, 4);

  // compute results by separate methods
  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
  m_result = m_left * m_right;

  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    // Sanity check that the two result buffers do not alias.
    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
    // Accept either a small absolute error...
    if (fabsf(t_result(i) - m_result(i)) < 1e-4f) {
      continue;
    }
    // ...or a small relative error.
    if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) {
      continue;
    }
    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
              << " vs " << m_result(i) << std::endl;
    assert(false);
  }
}
148
// Exercises contraction shapes that have been problematic in the past:
// full-size operands, a single-column LHS, a narrow RHS, and both degenerate
// shapes at once. Each output is pre-filled with NaN so that any element the
// contraction fails to write is detected by the isnan assert below.
template<int DataLayout>
void test_contraction_corner_cases()
{
  Tensor<float, 2, DataLayout> t_left(32, 500);
  Tensor<float, 2, DataLayout> t_right(32, 28*28);
  Tensor<float, 2, DataLayout> t_result(500, 28*28);

  // Random values shifted away from a purely positive range.
  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
  t_result = t_result.constant(NAN);

  // this contraction should be equivalent to a single matrix multiplication
  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}};

  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 32, 500);
  MapXf m_right(t_right.data(), 32, 28*28);
  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28);

  Eigen::ThreadPool tp(12);
  Eigen::ThreadPoolDevice thread_pool_device(&tp, 12);

  // compute results by separate methods
  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
  // Contracting dim 0 with dim 0 corresponds to lhs^T * rhs.
  m_result = m_left.transpose() * m_right;

  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    assert(!(numext::isnan)(t_result.data()[i]));
    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  // Corner case: single-column LHS, so the result is a single row.
  t_left.resize(32, 1);
  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
  t_result.resize (1, 28*28);
  t_result = t_result.constant(NAN);
  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
  // Re-seat the Map on the resized tensor's (possibly reallocated) buffer.
  new(&m_left) MapXf(t_left.data(), 32, 1);
  m_result = m_left.transpose() * m_right;
  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    assert(!(numext::isnan)(t_result.data()[i]));
    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  // Corner case: narrow RHS (few output columns).
  t_left.resize(32, 500);
  t_right.resize(32, 4);
  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
  t_result.resize (500, 4);
  t_result = t_result.constant(NAN);
  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
  new(&m_left) MapXf(t_left.data(), 32, 500);
  new(&m_right) MapXf(t_right.data(), 32, 4);
  m_result = m_left.transpose() * m_right;
  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    assert(!(numext::isnan)(t_result.data()[i]));
    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  // Corner case: both degenerate at once (1x4 result).
  t_left.resize(32, 1);
  t_right.resize(32, 4);
  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
  t_result.resize (1, 4);
  t_result = t_result.constant(NAN);
  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
  new(&m_left) MapXf(t_left.data(), 32, 1);
  new(&m_right) MapXf(t_right.data(), 32, 4);
  m_result = m_left.transpose() * m_right;
  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    assert(!(numext::isnan)(t_result.data()[i]));
    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
      assert(false);
    }
  }
}
235
236 template<int DataLayout>
test_multithread_contraction_agrees_with_singlethread()237 void test_multithread_contraction_agrees_with_singlethread() {
238 int contract_size = internal::random<int>(1, 5000);
239
240 Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80),
241 contract_size,
242 internal::random<int>(1, 100));
243
244 Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25),
245 internal::random<int>(1, 37),
246 contract_size,
247 internal::random<int>(1, 51));
248
249 left.setRandom();
250 right.setRandom();
251
252 // add constants to shift values away from 0 for more precision
253 left += left.constant(1.5f);
254 right += right.constant(1.5f);
255
256 typedef Tensor<float, 1>::DimensionPair DimPair;
257 Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
258
259 Eigen::ThreadPool tp(internal::random<int>(2, 11));
260 Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
261
262 Tensor<float, 5, DataLayout> st_result;
263 st_result = left.contract(right, dims);
264
265 Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
266 tp_result.device(thread_pool_device) = left.contract(right, dims);
267
268 VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
269 for (ptrdiff_t i = 0; i < st_result.size(); i++) {
270 // if both of the values are very small, then do nothing (because the test will fail
271 // due to numerical precision issues when values are small)
272 if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
273 VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
274 }
275 }
276 }
277
// Apply Sqrt to all output elements.
// TensorContraction output kernel: invoked once per finished output block
// with a mapper over that block; rewrites each element in place to its
// square root. The first three unnamed parameters (contraction params and
// the block's row/column offsets) are unused here.
struct SqrtOutputKernel {
  template <typename Index, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
      const TensorContractionParams&, Index, Index, Index num_rows,
      Index num_cols) const {
    for (int i = 0; i < num_rows; ++i) {
      for (int j = 0; j < num_cols; ++j) {
        output_mapper(i, j) = std::sqrt(output_mapper(i, j));
      }
    }
  }
};
292
// Same matrix-product-equivalent contraction pattern as
// test_multithread_contraction, but with SqrtOutputKernel applied to every
// output block, so the expected value is sqrt of the plain matrix product.
template <int DataLayout>
static void test_multithread_contraction_with_output_kernel() {
  typedef Tensor<float, 1>::DimensionPair DimPair;

  const int num_threads = internal::random<int>(2, 11);
  ThreadPool threads(num_threads);
  Eigen::ThreadPoolDevice device(&threads, num_threads);

  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);

  t_left.setRandom();
  t_right.setRandom();
  // Put trash in mat4 to verify contraction clears output memory.
  t_result.setRandom();

  // Add a little offset so that the results won't be close to zero.
  t_left += t_left.constant(1.0f);
  t_right += t_right.constant(1.0f);

  // Flat matrix views: (30*50)x(8*31) times (8*31)x(7*20*10).
  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 1500, 248);
  MapXf m_right(t_right.data(), 248, 1400);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);

  // this contraction should be equivalent to a single matrix multiplication
  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});

  // compute results by separate methods
  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());

  m_result = m_left * m_right;

  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
  }
}
332
// Asynchronous variant of the single-thread-agreement test: the threaded
// contraction is launched through the async device API and completion is
// awaited on an Eigen::Barrier before comparing results.
template<int DataLayout>
void test_async_multithread_contraction_agrees_with_singlethread()
{
  int contract_size = internal::random<int>(100, 500);

  Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40),
                                    contract_size,
                                    internal::random<int>(10, 40));

  Tensor<float, 4, DataLayout> right(
      internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size,
      internal::random<int>(1, 20));

  left.setRandom();
  right.setRandom();

  // add constants to shift values away from 0 for more precision
  left += left.constant(1.5f);
  right += right.constant(1.5f);

  // Contract left dim 1 against right dim 2 (both of size contract_size).
  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});

  // NOTE(review): the advertised core count (8-32) can exceed the actual
  // pool size (2-11) — presumably deliberate oversubscription; confirm.
  Eigen::ThreadPool tp(internal::random<int>(2, 11));
  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32));

  Tensor<float, 5, DataLayout> st_result;
  st_result = left.contract(right, dims);

  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());

  // Launch asynchronously and wait for the completion callback.
  Eigen::Barrier barrier(1);
  tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) =
      left.contract(right, dims);
  barrier.Wait();

  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
    // if both of the values are very small, then do nothing (because the test
    // will fail due to numerical precision issues when values are small)
    if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
    }
  }
}
378
379 // We are triggering 'evalShardedByInnerDim' optimization.
380 template <int DataLayout>
test_sharded_by_inner_dim_contraction()381 static void test_sharded_by_inner_dim_contraction()
382 {
383 typedef Tensor<float, 1>::DimensionPair DimPair;
384
385 const int num_threads = internal::random<int>(4, 16);
386 ThreadPool threads(num_threads);
387 Eigen::ThreadPoolDevice device(&threads, num_threads);
388
389 Tensor<float, 2, DataLayout> t_left(2, 10000);
390 Tensor<float, 2, DataLayout> t_right(10000, 10);
391 Tensor<float, 2, DataLayout> t_result(2, 10);
392
393 t_left.setRandom();
394 t_right.setRandom();
395 // Put trash in t_result to verify contraction clears output memory.
396 t_result.setRandom();
397
398 // Add a little offset so that the results won't be close to zero.
399 t_left += t_left.constant(1.0f);
400 t_right += t_right.constant(1.0f);
401
402 typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
403 MapXf m_left(t_left.data(), 2, 10000);
404 MapXf m_right(t_right.data(), 10000, 10);
405 Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
406
407 // this contraction should be equivalent to a single matrix multiplication
408 Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
409
410 // compute results by separate methods
411 t_result.device(device) = t_left.contract(t_right, dims);
412 m_result = m_left * m_right;
413
414 for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
415 VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
416 }
417 }
418
// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
// Same shapes as test_sharded_by_inner_dim_contraction, but SqrtOutputKernel
// is applied, so the expected value is sqrt of the plain matrix product.
template <int DataLayout>
static void test_sharded_by_inner_dim_contraction_with_output_kernel()
{
  typedef Tensor<float, 1>::DimensionPair DimPair;

  const int num_threads = internal::random<int>(4, 16);
  ThreadPool threads(num_threads);
  Eigen::ThreadPoolDevice device(&threads, num_threads);

  // Tiny output (2x10) with a long (10000) inner dimension.
  Tensor<float, 2, DataLayout> t_left(2, 10000);
  Tensor<float, 2, DataLayout> t_right(10000, 10);
  Tensor<float, 2, DataLayout> t_result(2, 10);

  t_left.setRandom();
  t_right.setRandom();
  // Put trash in t_result to verify contraction clears output memory.
  t_result.setRandom();

  // Add a little offset so that the results won't be close to zero.
  t_left += t_left.constant(1.0f);
  t_right += t_right.constant(1.0f);

  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 2, 10000);
  MapXf m_right(t_right.data(), 10000, 10);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);

  // this contraction should be equivalent to a single matrix multiplication
  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});

  // compute results by separate methods
  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
  m_result = m_left * m_right;

  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
  }
}
458
// We are triggering 'evalShardedByInnerDim' optimization.
// Asynchronous variant: the contraction is launched through the async device
// API and completion is awaited on an Eigen::Barrier before comparison.
template <int DataLayout>
static void test_async_sharded_by_inner_dim_contraction()
{
  typedef Tensor<float, 1>::DimensionPair DimPair;

  const int num_threads = internal::random<int>(4, 16);
  ThreadPool threads(num_threads);
  Eigen::ThreadPoolDevice device(&threads, num_threads);

  // Tiny output (2x10) with a long (10000) inner dimension.
  Tensor<float, 2, DataLayout> t_left(2, 10000);
  Tensor<float, 2, DataLayout> t_right(10000, 10);
  Tensor<float, 2, DataLayout> t_result(2, 10);

  t_left.setRandom();
  t_right.setRandom();
  // Put trash in t_result to verify contraction clears output memory.
  t_result.setRandom();

  // Add a little offset so that the results won't be close to zero.
  t_left += t_left.constant(1.0f);
  t_right += t_right.constant(1.0f);

  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 2, 10000);
  MapXf m_right(t_right.data(), 10000, 10);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);

  // this contraction should be equivalent to a single matrix multiplication
  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});

  // compute results by separate methods
  Eigen::Barrier barrier(1);
  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
      t_left.contract(t_right, dims);
  barrier.Wait();

  m_result = m_left * m_right;

  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
  }
}
502
// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
// Asynchronous variant with SqrtOutputKernel applied: expected values are
// sqrt of the plain matrix product; completion is awaited on a Barrier.
template <int DataLayout>
static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
{
  typedef Tensor<float, 1>::DimensionPair DimPair;

  const int num_threads = internal::random<int>(4, 16);
  ThreadPool threads(num_threads);
  Eigen::ThreadPoolDevice device(&threads, num_threads);

  // Tiny output (2x10) with a long (10000) inner dimension.
  Tensor<float, 2, DataLayout> t_left(2, 10000);
  Tensor<float, 2, DataLayout> t_right(10000, 10);
  Tensor<float, 2, DataLayout> t_result(2, 10);

  t_left.setRandom();
  t_right.setRandom();
  // Put trash in t_result to verify contraction clears output memory.
  t_result.setRandom();

  // Add a little offset so that the results won't be close to zero.
  t_left += t_left.constant(1.0f);
  t_right += t_right.constant(1.0f);

  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 2, 10000);
  MapXf m_right(t_right.data(), 10000, 10);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);

  // this contraction should be equivalent to a single matrix multiplication
  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});

  // compute results by separate methods
  Eigen::Barrier barrier(1);
  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
      t_left.contract(t_right, dims, SqrtOutputKernel());
  barrier.Wait();
  m_result = m_left * m_right;

  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
  }
}
545
546 template<int DataLayout>
test_full_contraction()547 void test_full_contraction() {
548 int contract_size1 = internal::random<int>(1, 500);
549 int contract_size2 = internal::random<int>(1, 500);
550
551 Tensor<float, 2, DataLayout> left(contract_size1,
552 contract_size2);
553 Tensor<float, 2, DataLayout> right(contract_size1,
554 contract_size2);
555 left.setRandom();
556 right.setRandom();
557
558 // add constants to shift values away from 0 for more precision
559 left += left.constant(1.5f);
560 right += right.constant(1.5f);
561
562 typedef Tensor<float, 2>::DimensionPair DimPair;
563 Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
564
565 Eigen::ThreadPool tp(internal::random<int>(2, 11));
566 Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
567
568 Tensor<float, 0, DataLayout> st_result;
569 st_result = left.contract(right, dims);
570
571 Tensor<float, 0, DataLayout> tp_result;
572 tp_result.device(thread_pool_device) = left.contract(right, dims);
573
574 VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
575 // if both of the values are very small, then do nothing (because the test will fail
576 // due to numerical precision issues when values are small)
577 if (numext::abs(st_result() - tp_result()) >= 1e-4f) {
578 VERIFY_IS_APPROX(st_result(), tp_result());
579 }
580 }
581
582 template<int DataLayout>
test_multithreaded_reductions()583 void test_multithreaded_reductions() {
584 const int num_threads = internal::random<int>(3, 11);
585 ThreadPool thread_pool(num_threads);
586 Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads);
587
588 const int num_rows = internal::random<int>(13, 732);
589 const int num_cols = internal::random<int>(13, 732);
590 Tensor<float, 2, DataLayout> t1(num_rows, num_cols);
591 t1.setRandom();
592
593 Tensor<float, 0, DataLayout> full_redux;
594 full_redux = t1.sum();
595
596 Tensor<float, 0, DataLayout> full_redux_tp;
597 full_redux_tp.device(thread_pool_device) = t1.sum();
598
599 // Check that the single threaded and the multi threaded reductions return
600 // the same result.
601 VERIFY_IS_APPROX(full_redux(), full_redux_tp());
602 }
603
604
test_memcpy()605 void test_memcpy() {
606
607 for (int i = 0; i < 5; ++i) {
608 const int num_threads = internal::random<int>(3, 11);
609 Eigen::ThreadPool tp(num_threads);
610 Eigen::ThreadPoolDevice thread_pool_device(&tp, num_threads);
611
612 const int size = internal::random<int>(13, 7632);
613 Tensor<float, 1> t1(size);
614 t1.setRandom();
615 std::vector<float> result(size);
616 thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float));
617 for (int j = 0; j < size; j++) {
618 VERIFY_IS_EQUAL(t1(j), result[j]);
619 }
620 }
621 }
622
623
// Smoke test: evaluates a NormalRandomGenerator expression on a two-thread
// device. There is no output verification; it only checks the expression
// runs to completion without crashing.
void test_multithread_random()
{
  Eigen::ThreadPool tp(2);
  Eigen::ThreadPoolDevice device(&tp, 2);
  Tensor<float, 1> t(1 << 20);
  t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();
}
631
632 template<int DataLayout>
test_multithread_shuffle(Allocator * allocator)633 void test_multithread_shuffle(Allocator* allocator)
634 {
635 Tensor<float, 4, DataLayout> tensor(17,5,7,11);
636 tensor.setRandom();
637
638 const int num_threads = internal::random<int>(2, 11);
639 ThreadPool threads(num_threads);
640 Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
641
642 Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
643 array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
644 shuffle.device(device) = tensor.shuffle(shuffles);
645
646 for (int i = 0; i < 17; ++i) {
647 for (int j = 0; j < 5; ++j) {
648 for (int k = 0; k < 7; ++k) {
649 for (int l = 0; l < 11; ++l) {
650 VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i));
651 }
652 }
653 }
654 }
655 }
656
test_threadpool_allocate(TestAllocator * allocator)657 void test_threadpool_allocate(TestAllocator* allocator)
658 {
659 const int num_threads = internal::random<int>(2, 11);
660 const int num_allocs = internal::random<int>(2, 11);
661 ThreadPool threads(num_threads);
662 Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
663
664 for (int a = 0; a < num_allocs; ++a) {
665 void* ptr = device.allocate(512);
666 device.deallocate(ptr);
667 }
668 VERIFY(allocator != NULL);
669 VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
670 VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
671 }
672
// Registers every subtest with Eigen's test harness. The CALL_SUBTEST_N
// grouping controls how CMake splits this file into separate test binaries
// (see the EIGEN_SUFFIXES marker at the bottom).
EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
{
  // Element-wise expressions (sync and async) and compound assignment.
  CALL_SUBTEST_1(test_multithread_elementwise());
  CALL_SUBTEST_1(test_async_multithread_elementwise());
  CALL_SUBTEST_1(test_multithread_compound_assignment());

  CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
  CALL_SUBTEST_2(test_multithread_contraction<RowMajor>());

  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());

  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());

  // Test EvalShardedByInnerDimContext parallelization strategy.
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());

  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());

  // Exercise various cases that have been problematic in the past.
  CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
  CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());

  CALL_SUBTEST_8(test_full_contraction<ColMajor>());
  CALL_SUBTEST_8(test_full_contraction<RowMajor>());

  CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
  CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());

  CALL_SUBTEST_10(test_memcpy());
  CALL_SUBTEST_10(test_multithread_random());

  // Custom-allocator tests: NULL exercises the default allocation path.
  TestAllocator test_allocator;
  CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
  CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
  CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));

  // Force CMake to split this test.
  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
}
722