• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
#include <gtest/gtest.h>

#include <pthreadpool.h>

#include <algorithm>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>
9 
10 
11 typedef std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> auto_pthreadpool_t;
12 
13 
/*
 * Iteration-space extents ("Range") and tile sizes ("Tile") used by the tests.
 * Every tiled range is a prime larger than its tile size, so no range is a
 * multiple of its tile and the last tile along a tiled dimension is partial.
 */

/* 1D */
const size_t kParallelize1DRange = 1223;
const size_t kParallelize1DTile1DRange = 1303;
const size_t kParallelize1DTile1DTile = 11;

/* 2D */
const size_t kParallelize2DRangeI = 41;
const size_t kParallelize2DRangeJ = 43;
const size_t kParallelize2DTile1DRangeI = 43;
const size_t kParallelize2DTile1DRangeJ = 53;
const size_t kParallelize2DTile1DTileJ = 5;
const size_t kParallelize2DTile2DRangeI = 53;
const size_t kParallelize2DTile2DRangeJ = 59;
const size_t kParallelize2DTile2DTileI = 5;
const size_t kParallelize2DTile2DTileJ = 7;

/* 3D */
const size_t kParallelize3DRangeI = 13;
const size_t kParallelize3DRangeJ = 17;
const size_t kParallelize3DRangeK = 19;
const size_t kParallelize3DTile1DRangeI = 17;
const size_t kParallelize3DTile1DRangeJ = 19;
const size_t kParallelize3DTile1DRangeK = 23;
const size_t kParallelize3DTile1DTileK = 5;
const size_t kParallelize3DTile2DRangeI = 19;
const size_t kParallelize3DTile2DRangeJ = 23;
const size_t kParallelize3DTile2DRangeK = 29;
const size_t kParallelize3DTile2DTileJ = 2;
const size_t kParallelize3DTile2DTileK = 3;

/* 4D */
const size_t kParallelize4DRangeI = 11;
const size_t kParallelize4DRangeJ = 13;
const size_t kParallelize4DRangeK = 17;
const size_t kParallelize4DRangeL = 19;
const size_t kParallelize4DTile1DRangeI = 13;
const size_t kParallelize4DTile1DRangeJ = 17;
const size_t kParallelize4DTile1DRangeK = 19;
const size_t kParallelize4DTile1DRangeL = 23;
const size_t kParallelize4DTile1DTileL = 5;
const size_t kParallelize4DTile2DRangeI = 17;
const size_t kParallelize4DTile2DRangeJ = 19;
const size_t kParallelize4DTile2DRangeK = 23;
const size_t kParallelize4DTile2DRangeL = 29;
const size_t kParallelize4DTile2DTileK = 2;
const size_t kParallelize4DTile2DTileL = 3;

/* 5D */
const size_t kParallelize5DRangeI = 7;
const size_t kParallelize5DRangeJ = 11;
const size_t kParallelize5DRangeK = 13;
const size_t kParallelize5DRangeL = 17;
const size_t kParallelize5DRangeM = 19;
const size_t kParallelize5DTile1DRangeI = 11;
const size_t kParallelize5DTile1DRangeJ = 13;
const size_t kParallelize5DTile1DRangeK = 17;
const size_t kParallelize5DTile1DRangeL = 19;
const size_t kParallelize5DTile1DRangeM = 23;
const size_t kParallelize5DTile1DTileM = 5;
const size_t kParallelize5DTile2DRangeI = 13;
const size_t kParallelize5DTile2DRangeJ = 17;
const size_t kParallelize5DTile2DRangeK = 19;
const size_t kParallelize5DTile2DRangeL = 23;
const size_t kParallelize5DTile2DRangeM = 29;
const size_t kParallelize5DTile2DTileL = 3;
const size_t kParallelize5DTile2DTileM = 2;

/* 6D */
const size_t kParallelize6DRangeI = 3;
const size_t kParallelize6DRangeJ = 5;
const size_t kParallelize6DRangeK = 7;
const size_t kParallelize6DRangeL = 11;
const size_t kParallelize6DRangeM = 13;
const size_t kParallelize6DRangeN = 17;
const size_t kParallelize6DTile1DRangeI = 5;
const size_t kParallelize6DTile1DRangeJ = 7;
const size_t kParallelize6DTile1DRangeK = 11;
const size_t kParallelize6DTile1DRangeL = 13;
const size_t kParallelize6DTile1DRangeM = 17;
const size_t kParallelize6DTile1DRangeN = 19;
const size_t kParallelize6DTile1DTileN = 5;
const size_t kParallelize6DTile2DRangeI = 7;
const size_t kParallelize6DTile2DRangeJ = 11;
const size_t kParallelize6DTile2DRangeK = 13;
const size_t kParallelize6DTile2DRangeL = 17;
const size_t kParallelize6DTile2DRangeM = 19;
const size_t kParallelize6DTile2DRangeN = 23;
const size_t kParallelize6DTile2DTileM = 3;
const size_t kParallelize6DTile2DTileN = 2;
92 
/* Number of repeated parallelize calls made by the "EachItemProcessedMultipleTimes" tests. */
const size_t kIncrementIterations = 101;
const size_t kIncrementIterations5D = 7;
const size_t kIncrementIterations6D = 3;
96 
/*
 * Microarchitecture indices passed to the *_with_uarch entry points.
 * The default index (42) is outside [0, kMaxUArchIndex], so a task can tell
 * the default apart from an in-range index.
 */
const uint32_t kMaxUArchIndex = 0;
const uint32_t kDefaultUArchIndex = 42;
99 
100 
/* pthreadpool_destroy must accept a null pool. */
TEST(CreateAndDestroy, NullThreadPool) {
	pthreadpool* threadpool = nullptr;
	pthreadpool_destroy(threadpool);
}
105 
/* A pool requested with one thread can be created and destroyed. */
TEST(CreateAndDestroy, SingleThreadPool) {
	pthreadpool* threadpool = pthreadpool_create(1);
	ASSERT_TRUE(threadpool);
	pthreadpool_destroy(threadpool);
}
111 
/* A pool requested with thread count 0 can be created and destroyed. */
TEST(CreateAndDestroy, MultiThreadPool) {
	pthreadpool* threadpool = pthreadpool_create(0);
	ASSERT_TRUE(threadpool);
	pthreadpool_destroy(threadpool);
}
117 
/* No-op 1D task; used by the "Completes" tests, which only require the call to return. */
static void ComputeNothing1D(void*, size_t) {
}
120 
/* parallelize_1d with a no-op task returns on a one-thread pool. */
TEST(Parallelize1D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d(
		threadpool.get(),
		ComputeNothing1D,
		nullptr,
		kParallelize1DRange,
		0 /* flags */);
}
131 
/* parallelize_1d with a no-op task returns on a multi-thread pool. */
TEST(Parallelize1D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d(
		threadpool.get(),
		ComputeNothing1D,
		nullptr,
		kParallelize1DRange,
		0 /* flags */);
}
147 
CheckBounds1D(void *,size_t i)148 static void CheckBounds1D(void*, size_t i) {
149 	EXPECT_LT(i, kParallelize1DRange);
150 }
151 
/* One-thread pool: every index handed to the task is within the requested range. */
TEST(Parallelize1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d(
		threadpool.get(),
		CheckBounds1D,
		nullptr,
		kParallelize1DRange,
		0 /* flags */);
}
163 
/* Multi-thread pool: every index handed to the task is within the requested range. */
TEST(Parallelize1D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d(
		threadpool.get(),
		CheckBounds1D,
		nullptr,
		kParallelize1DRange,
		0 /* flags */);
}
179 
/*
 * 1D task: marks item i as processed.
 *
 * processed_indicators points to an array of std::atomic_bool with one entry
 * per item. It is taken as void* so that this function has exactly the
 * signature the thread pool invokes tasks through (the same one as
 * ComputeNothing1D); calling a function through a pointer of a different
 * function type is undefined behaviour.
 */
static void SetTrue1D(void* processed_indicators, size_t i) {
	static_cast<std::atomic_bool*>(processed_indicators)[i].store(true, std::memory_order_relaxed);
}
183 
/* One-thread pool: after a single pass every index in the range has been flagged. */
TEST(Parallelize1D, SingleThreadPoolAllItemsProcessed) {
	/* One flag per item; the vector value-initializes them. */
	std::vector<std::atomic_bool> indicators(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_t>(SetTrue1D),
		static_cast<void*>(indicators.data()),
		kParallelize1DRange,
		0 /* flags */);

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed))
			<< "Element " << i << " not processed";
	}
}
202 
/* Multi-thread pool: after a single pass every index in the range has been flagged. */
TEST(Parallelize1D, MultiThreadPoolAllItemsProcessed) {
	/* One flag per item; the vector value-initializes them. */
	std::vector<std::atomic_bool> indicators(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_t>(SetTrue1D),
		static_cast<void*>(indicators.data()),
		kParallelize1DRange,
		0 /* flags */);

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed))
			<< "Element " << i << " not processed";
	}
}
225 
/*
 * 1D task: bumps the per-item counter for item i by one.
 *
 * processed_counters points to an array of std::atomic_int with one entry per
 * item. It is taken as void* so that this function has exactly the signature
 * the thread pool invokes tasks through; calling a function through a pointer
 * of a different function type is undefined behaviour.
 */
static void Increment1D(void* processed_counters, size_t i) {
	static_cast<std::atomic_int*>(processed_counters)[i].fetch_add(1, std::memory_order_relaxed);
}
229 
/* One-thread pool: a single pass visits every index exactly once. */
TEST(Parallelize1D, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per item; the vector value-initializes them. */
	std::vector<std::atomic_int> counters(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_t>(Increment1D),
		static_cast<void*>(counters.data()),
		kParallelize1DRange,
		0 /* flags */);

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
248 
/* Multi-thread pool: a single pass visits every index exactly once. */
TEST(Parallelize1D, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per item; the vector value-initializes them. */
	std::vector<std::atomic_int> counters(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_t>(Increment1D),
		static_cast<void*>(counters.data()),
		kParallelize1DRange,
		0 /* flags */);

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
271 
/* One-thread pool: kIncrementIterations passes visit every index exactly that many times. */
TEST(Parallelize1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per item; the vector value-initializes them. */
	std::vector<std::atomic_int> counters(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) {
		pthreadpool_parallelize_1d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_1d_t>(Increment1D),
			static_cast<void*>(counters.data()),
			kParallelize1DRange,
			0 /* flags */);
	}

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		/* Compare as size_t so the int counter is not mixed with an unsigned constant. */
		EXPECT_EQ(static_cast<size_t>(counters[i].load(std::memory_order_relaxed)), kIncrementIterations)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
293 
/* Multi-thread pool: kIncrementIterations passes visit every index exactly that many times. */
TEST(Parallelize1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per item; the vector value-initializes them. */
	std::vector<std::atomic_int> counters(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) {
		pthreadpool_parallelize_1d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_1d_t>(Increment1D),
			static_cast<void*>(counters.data()),
			kParallelize1DRange,
			0 /* flags */);
	}

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		/* Compare as size_t so the int counter is not mixed with an unsigned constant. */
		EXPECT_EQ(static_cast<size_t>(counters[i].load(std::memory_order_relaxed)), kIncrementIterations)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
319 
/*
 * 1D task: bumps one shared counter, regardless of the item index, so every
 * invocation contends on the same atomic.
 *
 * num_processed_items points to a single std::atomic_int. It is taken as void*
 * so that this function has exactly the signature the thread pool invokes
 * tasks through; calling a function through a pointer of a different function
 * type is undefined behaviour. The item index is unused and left unnamed.
 */
static void IncrementSame1D(void* num_processed_items, size_t) {
	static_cast<std::atomic_int*>(num_processed_items)->fetch_add(1, std::memory_order_relaxed);
}
323 
/* Multi-thread pool: all items increment one shared counter; the total must equal the range. */
TEST(Parallelize1D, MultiThreadPoolHighContention) {
	/* Direct initialization; ATOMIC_VAR_INIT is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_t>(IncrementSame1D),
		static_cast<void*>(&num_processed_items),
		kParallelize1DRange,
		0 /* flags */);
	/* Compare as size_t so the int counter is not mixed with an unsigned constant. */
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), kParallelize1DRange);
}
342 
WorkImbalance1D(std::atomic_int * num_processed_items,size_t i)343 static void WorkImbalance1D(std::atomic_int* num_processed_items, size_t i) {
344 	num_processed_items->fetch_add(1, std::memory_order_relaxed);
345 	if (i == 0) {
346 		/* Spin-wait until all items are computed */
347 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DRange) {
348 			std::atomic_thread_fence(std::memory_order_acquire);
349 		}
350 	}
351 }
352 
/*
 * Multi-thread pool: item 0 blocks until every item has been counted, so the
 * call can only finish if the remaining items are executed by other threads.
 */
TEST(Parallelize1D, MultiThreadPoolWorkStealing) {
	/* Direct initialization; ATOMIC_VAR_INIT is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* With a single thread the spinning item could never be unblocked. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_t>(WorkImbalance1D),
		static_cast<void*>(&num_processed_items),
		kParallelize1DRange,
		0 /* flags */);
	/* Compare as size_t so the int counter is not mixed with an unsigned constant. */
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), kParallelize1DRange);
}
371 
/* No-op 1D task for the *_with_uarch entry point; used by the "Completes" tests. */
static void ComputeNothing1DWithUArch(void*, uint32_t, size_t) {
}
374 
/* parallelize_1d_with_uarch with a no-op task returns on a one-thread pool. */
TEST(Parallelize1DWithUArch, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		ComputeNothing1DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
}
387 
/* parallelize_1d_with_uarch with a no-op task returns on a multi-thread pool. */
TEST(Parallelize1DWithUArch, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		ComputeNothing1DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
}
405 
CheckUArch1DWithUArch(void *,uint32_t uarch_index,size_t)406 static void CheckUArch1DWithUArch(void*, uint32_t uarch_index, size_t) {
407 	if (uarch_index != kDefaultUArchIndex) {
408 		EXPECT_LE(uarch_index, kMaxUArchIndex);
409 	}
410 }
411 
/* One-thread pool: every uarch index handed to the task is the default or within the maximum. */
TEST(Parallelize1DWithUArch, SingleThreadPoolUArchInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		CheckUArch1DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
}
424 
/* Multi-thread pool: every uarch index handed to the task is the default or within the maximum. */
TEST(Parallelize1DWithUArch, MultiThreadPoolUArchInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		CheckUArch1DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
}
442 
CheckBounds1DWithUArch(void *,uint32_t,size_t i)443 static void CheckBounds1DWithUArch(void*, uint32_t, size_t i) {
444 	EXPECT_LT(i, kParallelize1DRange);
445 }
446 
/* One-thread pool: every index handed to the task is within the requested range. */
TEST(Parallelize1DWithUArch, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		CheckBounds1DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
}
460 
/* Multi-thread pool: every index handed to the task is within the requested range. */
TEST(Parallelize1DWithUArch, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		CheckBounds1DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
}
478 
/*
 * 1D task (uarch variant): marks item i as processed; the uarch index is ignored.
 *
 * processed_indicators points to an array of std::atomic_bool with one entry
 * per item. It is taken as void* so that this function has exactly the
 * signature the thread pool invokes tasks through (the same one as
 * ComputeNothing1DWithUArch); calling a function through a pointer of a
 * different function type is undefined behaviour.
 */
static void SetTrue1DWithUArch(void* processed_indicators, uint32_t, size_t i) {
	static_cast<std::atomic_bool*>(processed_indicators)[i].store(true, std::memory_order_relaxed);
}
482 
/* One-thread pool: after a single pass every index in the range has been flagged. */
TEST(Parallelize1DWithUArch, SingleThreadPoolAllItemsProcessed) {
	/* One flag per item; the vector value-initializes them. */
	std::vector<std::atomic_bool> indicators(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_with_id_t>(SetTrue1DWithUArch),
		static_cast<void*>(indicators.data()),
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed))
			<< "Element " << i << " not processed";
	}
}
503 
/* Multi-thread pool: after a single pass every index in the range has been flagged. */
TEST(Parallelize1DWithUArch, MultiThreadPoolAllItemsProcessed) {
	/* One flag per item; the vector value-initializes them. */
	std::vector<std::atomic_bool> indicators(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_with_id_t>(SetTrue1DWithUArch),
		static_cast<void*>(indicators.data()),
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed))
			<< "Element " << i << " not processed";
	}
}
528 
/*
 * 1D task (uarch variant): bumps the per-item counter for item i by one; the
 * uarch index is ignored.
 *
 * processed_counters points to an array of std::atomic_int with one entry per
 * item. It is taken as void* so that this function has exactly the signature
 * the thread pool invokes tasks through; calling a function through a pointer
 * of a different function type is undefined behaviour.
 */
static void Increment1DWithUArch(void* processed_counters, uint32_t, size_t i) {
	static_cast<std::atomic_int*>(processed_counters)[i].fetch_add(1, std::memory_order_relaxed);
}
532 
/* One-thread pool: a single pass visits every index exactly once. */
TEST(Parallelize1DWithUArch, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per item; the vector value-initializes them. */
	std::vector<std::atomic_int> counters(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_with_id_t>(Increment1DWithUArch),
		static_cast<void*>(counters.data()),
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
553 
/* Multi-thread pool: a single pass visits every index exactly once. */
TEST(Parallelize1DWithUArch, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per item; the vector value-initializes them. */
	std::vector<std::atomic_int> counters(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_with_id_t>(Increment1DWithUArch),
		static_cast<void*>(counters.data()),
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
578 
/* One-thread pool: kIncrementIterations passes visit every index exactly that many times. */
TEST(Parallelize1DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per item; the vector value-initializes them. */
	std::vector<std::atomic_int> counters(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) {
		pthreadpool_parallelize_1d_with_uarch(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_1d_with_id_t>(Increment1DWithUArch),
			static_cast<void*>(counters.data()),
			kDefaultUArchIndex,
			kMaxUArchIndex,
			kParallelize1DRange,
			0 /* flags */);
	}

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		/* Compare as size_t so the int counter is not mixed with an unsigned constant. */
		EXPECT_EQ(static_cast<size_t>(counters[i].load(std::memory_order_relaxed)), kIncrementIterations)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
602 
/* Multi-thread pool: kIncrementIterations passes visit every index exactly that many times. */
TEST(Parallelize1DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per item; the vector value-initializes them. */
	std::vector<std::atomic_int> counters(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) {
		pthreadpool_parallelize_1d_with_uarch(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_1d_with_id_t>(Increment1DWithUArch),
			static_cast<void*>(counters.data()),
			kDefaultUArchIndex,
			kMaxUArchIndex,
			kParallelize1DRange,
			0 /* flags */);
	}

	for (size_t i = 0; i < kParallelize1DRange; i++) {
		/* Compare as size_t so the int counter is not mixed with an unsigned constant. */
		EXPECT_EQ(static_cast<size_t>(counters[i].load(std::memory_order_relaxed)), kIncrementIterations)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
630 
/*
 * 1D task (uarch variant): bumps one shared counter, regardless of the uarch
 * index or item index, so every invocation contends on the same atomic.
 *
 * num_processed_items points to a single std::atomic_int. It is taken as void*
 * so that this function has exactly the signature the thread pool invokes
 * tasks through; calling a function through a pointer of a different function
 * type is undefined behaviour. The unused parameters are left unnamed.
 */
static void IncrementSame1DWithUArch(void* num_processed_items, uint32_t, size_t) {
	static_cast<std::atomic_int*>(num_processed_items)->fetch_add(1, std::memory_order_relaxed);
}
634 
/* Multi-thread pool: all items increment one shared counter; the total must equal the range. */
TEST(Parallelize1DWithUArch, MultiThreadPoolHighContention) {
	/* Direct initialization; ATOMIC_VAR_INIT is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_with_id_t>(IncrementSame1DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
	/* Compare as size_t so the int counter is not mixed with an unsigned constant. */
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), kParallelize1DRange);
}
655 
WorkImbalance1DWithUArch(std::atomic_int * num_processed_items,uint32_t,size_t i)656 static void WorkImbalance1DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i) {
657 	num_processed_items->fetch_add(1, std::memory_order_relaxed);
658 	if (i == 0) {
659 		/* Spin-wait until all items are computed */
660 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DRange) {
661 			std::atomic_thread_fence(std::memory_order_acquire);
662 		}
663 	}
664 }
665 
/*
 * Multi-thread pool: item 0 blocks until every item has been counted, so the
 * call can only finish if the remaining items are executed by other threads.
 */
TEST(Parallelize1DWithUArch, MultiThreadPoolWorkStealing) {
	/* Direct initialization; ATOMIC_VAR_INIT is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* With a single thread the spinning item could never be unblocked. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_with_id_t>(WorkImbalance1DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
	/* Compare as size_t so the int counter is not mixed with an unsigned constant. */
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), kParallelize1DRange);
}
686 
/* No-op tiled 1D task; used by the "Completes" tests, which only require the call to return. */
static void ComputeNothing1DTile1D(void*, size_t, size_t) {
}
689 
/* parallelize_1d_tile_1d with a no-op task returns on a one-thread pool. */
TEST(Parallelize1DTile1D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(),
		ComputeNothing1DTile1D,
		nullptr,
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);
}
700 
/* parallelize_1d_tile_1d with a no-op task returns on a multi-thread pool. */
TEST(Parallelize1DTile1D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(),
		ComputeNothing1DTile1D,
		nullptr,
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);
}
716 
CheckBounds1DTile1D(void *,size_t start_i,size_t tile_i)717 static void CheckBounds1DTile1D(void*, size_t start_i, size_t tile_i) {
718 	EXPECT_LT(start_i, kParallelize1DTile1DRange);
719 	EXPECT_LE(start_i + tile_i, kParallelize1DTile1DRange);
720 }
721 
/* One-thread pool: every tile handed to the task lies within the requested range. */
TEST(Parallelize1DTile1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(),
		CheckBounds1DTile1D,
		nullptr,
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);
}
733 
/* Multi-thread pool: every tile handed to the task lies within the requested range. */
TEST(Parallelize1DTile1D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(),
		CheckBounds1DTile1D,
		nullptr,
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);
}
749 
CheckTiling1DTile1D(void *,size_t start_i,size_t tile_i)750 static void CheckTiling1DTile1D(void*, size_t start_i, size_t tile_i) {
751 	EXPECT_GT(tile_i, 0);
752 	EXPECT_LE(tile_i, kParallelize1DTile1DTile);
753 	EXPECT_EQ(start_i % kParallelize1DTile1DTile, 0);
754 	EXPECT_EQ(tile_i, std::min<size_t>(kParallelize1DTile1DTile, kParallelize1DTile1DRange - start_i));
755 }
756 
/* One-thread pool: tiles are aligned to the tile size and only the last one may be short. */
TEST(Parallelize1DTile1D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(),
		CheckTiling1DTile1D,
		nullptr,
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);
}
768 
/* Multi-thread pool: tiles are aligned to the tile size and only the last one may be short. */
TEST(Parallelize1DTile1D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(),
		CheckTiling1DTile1D,
		nullptr,
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);
}
784 
/*
 * Tiled 1D task: marks every item in [start_i, start_i + tile_i) as processed.
 *
 * processed_indicators points to an array of std::atomic_bool with one entry
 * per item. It is taken as void* so that this function has exactly the
 * signature the thread pool invokes tasks through (the same one as
 * ComputeNothing1DTile1D); calling a function through a pointer of a
 * different function type is undefined behaviour.
 */
static void SetTrue1DTile1D(void* processed_indicators, size_t start_i, size_t tile_i) {
	std::atomic_bool* indicators = static_cast<std::atomic_bool*>(processed_indicators);
	for (size_t i = start_i; i < start_i + tile_i; i++) {
		indicators[i].store(true, std::memory_order_relaxed);
	}
}
790 
/* One-thread pool: after a single tiled pass every index in the range has been flagged. */
TEST(Parallelize1DTile1D, SingleThreadPoolAllItemsProcessed) {
	/* One flag per item; the vector value-initializes them. */
	std::vector<std::atomic_bool> indicators(kParallelize1DTile1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(SetTrue1DTile1D),
		static_cast<void*>(indicators.data()),
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);

	for (size_t i = 0; i < kParallelize1DTile1DRange; i++) {
		EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed))
			<< "Element " << i << " not processed";
	}
}
809 
/* Multi-thread pool: after a single tiled pass every index in the range has been flagged. */
TEST(Parallelize1DTile1D, MultiThreadPoolAllItemsProcessed) {
	/* One flag per item; the vector value-initializes them. */
	std::vector<std::atomic_bool> indicators(kParallelize1DTile1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Nothing multi-threaded to test if the pool ended up with a single thread. */
	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(SetTrue1DTile1D),
		static_cast<void*>(indicators.data()),
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);

	for (size_t i = 0; i < kParallelize1DTile1DRange; i++) {
		EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed))
			<< "Element " << i << " not processed";
	}
}
832 
/*
 * Task body for the tiled 1D tests: adds one to the counter of every element
 * in the tile [start_i, start_i + tile_i).
 *
 * processed_counters - array of per-element counters.
 * start_i            - index of the first element of the tile.
 * tile_i             - number of elements in the tile.
 */
static void Increment1DTile1D(std::atomic_int* processed_counters, size_t start_i, size_t tile_i) {
	const size_t end_i = start_i + tile_i;
	size_t index = start_i;
	while (index < end_i) {
		processed_counters[index].fetch_add(1, std::memory_order_relaxed);
		++index;
	}
}
838 
/*
 * Runs Increment1DTile1D once over the tiled 1D range on a pool created with
 * a thread count of 1 and checks that every element's counter equals 1.
 */
TEST(Parallelize1DTile1D, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per element of the range */
	std::vector<std::atomic_int> counters(kParallelize1DTile1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(), task, context,
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);

	/* A count other than 1 means an element was missed or visited repeatedly */
	for (size_t i = 0; i < kParallelize1DTile1DRange; ++i) {
		EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
857 
/*
 * Runs Increment1DTile1D once over the tiled 1D range on a pool created with
 * a thread count of 0 and checks that every element's counter equals 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per element of the range */
	std::vector<std::atomic_int> counters(kParallelize1DTile1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(), task, context,
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);

	/* A count other than 1 means an element was missed or visited repeatedly */
	for (size_t i = 0; i < kParallelize1DTile1DRange; ++i) {
		EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
880 
/*
 * Reuses one pool (created with a thread count of 1) for kIncrementIterations
 * consecutive parallelize calls and checks that every element's counter
 * equals the number of calls.
 */
TEST(Parallelize1DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element of the range */
	std::vector<std::atomic_int> counters(kParallelize1DTile1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_1d_tile_1d(
			threadpool.get(), task, context,
			kParallelize1DTile1DRange, kParallelize1DTile1DTile,
			0 /* flags */);
	}

	/* Each pass must have touched each element exactly once */
	for (size_t i = 0; i < kParallelize1DTile1DRange; ++i) {
		EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
902 
/*
 * Reuses one pool (created with a thread count of 0) for kIncrementIterations
 * consecutive parallelize calls and checks that every element's counter
 * equals the number of calls. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element of the range */
	std::vector<std::atomic_int> counters(kParallelize1DTile1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_1d_tile_1d(
			threadpool.get(), task, context,
			kParallelize1DTile1DRange, kParallelize1DTile1DTile,
			0 /* flags */);
	}

	/* Each pass must have touched each element exactly once */
	for (size_t i = 0; i < kParallelize1DTile1DRange; ++i) {
		EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations)
			<< "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
928 
/*
 * Task body for the contention test: bumps one shared counter once per element
 * of the tile [start_i, start_i + tile_i), using a separate atomic add for
 * each element.
 *
 * num_processed_items - the single counter shared by all tiles.
 * start_i             - index of the first element of the tile.
 * tile_i              - number of elements in the tile.
 */
static void IncrementSame1DTile1D(std::atomic_int* num_processed_items, size_t start_i, size_t tile_i) {
	const size_t end_i = start_i + tile_i;
	size_t index = start_i;
	while (index < end_i) {
		num_processed_items->fetch_add(1, std::memory_order_relaxed);
		++index;
	}
}
934 
/*
 * Has every tile increment one shared counter (via IncrementSame1DTile1D) and
 * checks that the final count equals the number of elements in the 1D range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolHighContention) {
	/* Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(IncrementSame1DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);
	/* Convert to the constant's type so the comparison is not mixed-sign */
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), kParallelize1DTile1DRange);
}
953 
WorkImbalance1DTile1D(std::atomic_int * num_processed_items,size_t start_i,size_t tile_i)954 static void WorkImbalance1DTile1D(std::atomic_int* num_processed_items, size_t start_i, size_t tile_i) {
955 	num_processed_items->fetch_add(tile_i, std::memory_order_relaxed);
956 	if (start_i == 0) {
957 		/* Spin-wait until all items are computed */
958 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DTile1DRange) {
959 			std::atomic_thread_fence(std::memory_order_acquire);
960 		}
961 	}
962 }
963 
/*
 * Runs WorkImbalance1DTile1D, whose first tile spin-waits until every element
 * has been counted, so the call can only return if the remaining tiles are
 * executed by other threads. Afterwards the shared counter must equal the
 * number of elements. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolWorkStealing) {
	/* Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(WorkImbalance1DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize1DTile1DRange, kParallelize1DTile1DTile,
		0 /* flags */);
	/* Convert to the constant's type so the comparison is not mixed-sign */
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), kParallelize1DTile1DRange);
}
982 
/* No-op 2D task: ignores its context and both indices. */
static void ComputeNothing2D(void*, size_t, size_t) {}
985 
/*
 * Smoke test: parallelizing the no-op task over the 2D range returns on a
 * pool created with a thread count of 1.
 */
TEST(Parallelize2D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_2d(
		threadpool.get(), ComputeNothing2D, nullptr /* context */,
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);
}
996 
/*
 * Smoke test: parallelizing the no-op task over the 2D range returns on a
 * pool created with a thread count of 0. Skipped when the pool reports at
 * most one thread.
 */
TEST(Parallelize2D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d(
		threadpool.get(), ComputeNothing2D, nullptr /* context */,
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);
}
1012 
CheckBounds2D(void *,size_t i,size_t j)1013 static void CheckBounds2D(void*, size_t i, size_t j) {
1014 	EXPECT_LT(i, kParallelize2DRangeI);
1015 	EXPECT_LT(j, kParallelize2DRangeJ);
1016 }
1017 
/*
 * Runs CheckBounds2D over the 2D range on a pool created with a thread count
 * of 1; any out-of-range index is reported by the task itself.
 */
TEST(Parallelize2D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_2d(
		threadpool.get(), CheckBounds2D, nullptr /* context */,
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);
}
1029 
/*
 * Runs CheckBounds2D over the 2D range on a pool created with a thread count
 * of 0; any out-of-range index is reported by the task itself. Skipped when
 * the pool reports at most one thread.
 */
TEST(Parallelize2D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d(
		threadpool.get(), CheckBounds2D, nullptr /* context */,
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);
}
1045 
SetTrue2D(std::atomic_bool * processed_indicators,size_t i,size_t j)1046 static void SetTrue2D(std::atomic_bool* processed_indicators, size_t i, size_t j) {
1047 	const size_t linear_idx = i * kParallelize2DRangeJ + j;
1048 	processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
1049 }
1050 
/*
 * Runs SetTrue2D over the 2D range on a pool created with a thread count of 1
 * and checks that the flag of every (i, j) element ends up set.
 */
TEST(Parallelize2D, SingleThreadPoolAllItemsProcessed) {
	/* One flag per element, laid out row-major */
	std::vector<std::atomic_bool> indicators(kParallelize2DRangeI * kParallelize2DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_2d_t>(SetTrue2D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_2d(
		threadpool.get(), task, context,
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DRangeI * kParallelize2DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DRangeJ;
		const size_t j = linear_idx % kParallelize2DRangeJ;
		EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
			<< "Element (" << i << ", " << j << ") not processed";
	}
}
1072 
/*
 * Runs SetTrue2D over the 2D range on a pool created with a thread count of 0
 * and checks that the flag of every (i, j) element ends up set. Skipped when
 * the pool reports at most one thread.
 */
TEST(Parallelize2D, MultiThreadPoolAllItemsProcessed) {
	/* One flag per element, laid out row-major */
	std::vector<std::atomic_bool> indicators(kParallelize2DRangeI * kParallelize2DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_2d_t>(SetTrue2D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_2d(
		threadpool.get(), task, context,
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DRangeI * kParallelize2DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DRangeJ;
		const size_t j = linear_idx % kParallelize2DRangeJ;
		EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
			<< "Element (" << i << ", " << j << ") not processed";
	}
}
1098 
Increment2D(std::atomic_int * processed_counters,size_t i,size_t j)1099 static void Increment2D(std::atomic_int* processed_counters, size_t i, size_t j) {
1100 	const size_t linear_idx = i * kParallelize2DRangeJ + j;
1101 	processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
1102 }
1103 
/*
 * Runs Increment2D once over the 2D range on a pool created with a thread
 * count of 1 and checks that every element's counter equals 1.
 */
TEST(Parallelize2D, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per element, laid out row-major */
	std::vector<std::atomic_int> counters(kParallelize2DRangeI * kParallelize2DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_2d_t>(Increment2D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_2d(
		threadpool.get(), task, context,
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DRangeI * kParallelize2DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DRangeJ;
		const size_t j = linear_idx % kParallelize2DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
1126 
/*
 * Runs Increment2D once over the 2D range on a pool created with a thread
 * count of 0 and checks that every element's counter equals 1. Skipped when
 * the pool reports at most one thread.
 */
TEST(Parallelize2D, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per element, laid out row-major */
	std::vector<std::atomic_int> counters(kParallelize2DRangeI * kParallelize2DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_2d_t>(Increment2D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_2d(
		threadpool.get(), task, context,
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DRangeI * kParallelize2DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DRangeJ;
		const size_t j = linear_idx % kParallelize2DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
1153 
/*
 * Reuses one pool (created with a thread count of 1) for kIncrementIterations
 * consecutive 2D parallelize calls and checks that every element's counter
 * equals the number of calls.
 */
TEST(Parallelize2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element, laid out row-major */
	std::vector<std::atomic_int> counters(kParallelize2DRangeI * kParallelize2DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_2d_t>(Increment2D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_2d(
			threadpool.get(), task, context,
			kParallelize2DRangeI, kParallelize2DRangeJ,
			0 /* flags */);
	}

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DRangeI * kParallelize2DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DRangeJ;
		const size_t j = linear_idx % kParallelize2DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
			<< "Element (" << i << ", " << j << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
1179 
/*
 * Reuses one pool (created with a thread count of 0) for kIncrementIterations
 * consecutive 2D parallelize calls and checks that every element's counter
 * equals the number of calls. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element, laid out row-major */
	std::vector<std::atomic_int> counters(kParallelize2DRangeI * kParallelize2DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_2d_t>(Increment2D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_2d(
			threadpool.get(), task, context,
			kParallelize2DRangeI, kParallelize2DRangeJ,
			0 /* flags */);
	}

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DRangeI * kParallelize2DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DRangeJ;
		const size_t j = linear_idx % kParallelize2DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
			<< "Element (" << i << ", " << j << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
1209 
/*
 * 2D task for the contention test: adds one to the single shared counter,
 * regardless of which (i, j) element it was invoked for.
 */
static void IncrementSame2D(std::atomic_int* num_processed_items, size_t i, size_t j) {
	std::atomic_int& shared_counter = *num_processed_items;
	shared_counter.fetch_add(1, std::memory_order_relaxed);
}
1213 
/*
 * Has every (i, j) element increment one shared counter (via IncrementSame2D)
 * and checks that the final count equals the number of elements in the 2D
 * range. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2D, MultiThreadPoolHighContention) {
	/* Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_t>(IncrementSame2D),
		static_cast<void*>(&num_processed_items),
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);
	/* Convert to the constants' type so the comparison is not mixed-sign */
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), kParallelize2DRangeI * kParallelize2DRangeJ);
}
1232 
WorkImbalance2D(std::atomic_int * num_processed_items,size_t i,size_t j)1233 static void WorkImbalance2D(std::atomic_int* num_processed_items, size_t i, size_t j) {
1234 	num_processed_items->fetch_add(1, std::memory_order_relaxed);
1235 	if (i == 0 && j == 0) {
1236 		/* Spin-wait until all items are computed */
1237 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DRangeI * kParallelize2DRangeJ) {
1238 			std::atomic_thread_fence(std::memory_order_acquire);
1239 		}
1240 	}
1241 }
1242 
/*
 * Runs WorkImbalance2D, whose (0, 0) invocation spin-waits until every element
 * has been counted, so the call can only return if the remaining elements are
 * executed by other threads. Afterwards the shared counter must equal the
 * number of elements. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2D, MultiThreadPoolWorkStealing) {
	/* Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_t>(WorkImbalance2D),
		static_cast<void*>(&num_processed_items),
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);
	/* Convert to the constants' type so the comparison is not mixed-sign */
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), kParallelize2DRangeI * kParallelize2DRangeJ);
}
1261 
/* No-op task for the 2D/1D-tiled tests: ignores its context, index, tile start and tile size. */
static void ComputeNothing2DTile1D(void*, size_t, size_t, size_t) {}
1264 
/*
 * Smoke test: parallelizing the no-op task over the 2D range with 1D tiling
 * returns on a pool created with a thread count of 1.
 */
TEST(Parallelize2DTile1D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(), ComputeNothing2DTile1D, nullptr /* context */,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1275 
/*
 * Smoke test: parallelizing the no-op task over the 2D range with 1D tiling
 * returns on a pool created with a thread count of 0. Skipped when the pool
 * reports at most one thread.
 */
TEST(Parallelize2DTile1D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(), ComputeNothing2DTile1D, nullptr /* context */,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1291 
CheckBounds2DTile1D(void *,size_t i,size_t start_j,size_t tile_j)1292 static void CheckBounds2DTile1D(void*, size_t i, size_t start_j, size_t tile_j) {
1293 	EXPECT_LT(i, kParallelize2DTile1DRangeI);
1294 	EXPECT_LT(start_j, kParallelize2DTile1DRangeJ);
1295 	EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ);
1296 }
1297 
/*
 * Runs CheckBounds2DTile1D over the tiled 2D range on a pool created with a
 * thread count of 1; out-of-range tiles are reported by the task itself.
 */
TEST(Parallelize2DTile1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(), CheckBounds2DTile1D, nullptr /* context */,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1309 
/*
 * Runs CheckBounds2DTile1D over the tiled 2D range on a pool created with a
 * thread count of 0; out-of-range tiles are reported by the task itself.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile1D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(), CheckBounds2DTile1D, nullptr /* context */,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1325 
CheckTiling2DTile1D(void *,size_t i,size_t start_j,size_t tile_j)1326 static void CheckTiling2DTile1D(void*, size_t i, size_t start_j, size_t tile_j) {
1327 	EXPECT_GT(tile_j, 0);
1328 	EXPECT_LE(tile_j, kParallelize2DTile1DTileJ);
1329 	EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0);
1330 	EXPECT_EQ(tile_j, std::min<size_t>(kParallelize2DTile1DTileJ, kParallelize2DTile1DRangeJ - start_j));
1331 }
1332 
/*
 * Runs CheckTiling2DTile1D over the tiled 2D range on a pool created with a
 * thread count of 1; malformed tiles are reported by the task itself.
 */
TEST(Parallelize2DTile1D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(), CheckTiling2DTile1D, nullptr /* context */,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1344 
/*
 * Runs CheckTiling2DTile1D over the tiled 2D range on a pool created with a
 * thread count of 0; malformed tiles are reported by the task itself.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile1D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(), CheckTiling2DTile1D, nullptr /* context */,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1360 
SetTrue2DTile1D(std::atomic_bool * processed_indicators,size_t i,size_t start_j,size_t tile_j)1361 static void SetTrue2DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t start_j, size_t tile_j) {
1362 	for (size_t j = start_j; j < start_j + tile_j; j++) {
1363 		const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j;
1364 		processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
1365 	}
1366 }
1367 
/*
 * Runs SetTrue2DTile1D over the tiled 2D range on a pool created with a thread
 * count of 1 and checks that the flag of every (i, j) element ends up set.
 */
TEST(Parallelize2DTile1D, SingleThreadPoolAllItemsProcessed) {
	/* One flag per element, laid out row-major */
	std::vector<std::atomic_bool> indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(SetTrue2DTile1D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(), task, context,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DTile1DRangeJ;
		const size_t j = linear_idx % kParallelize2DTile1DRangeJ;
		EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
			<< "Element (" << i << ", " << j << ") not processed";
	}
}
1389 
/*
 * Runs SetTrue2DTile1D over the tiled 2D range on a pool created with a thread
 * count of 0 and checks that the flag of every (i, j) element ends up set.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile1D, MultiThreadPoolAllItemsProcessed) {
	/* One flag per element, laid out row-major */
	std::vector<std::atomic_bool> indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(SetTrue2DTile1D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(), task, context,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DTile1DRangeJ;
		const size_t j = linear_idx % kParallelize2DTile1DRangeJ;
		EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
			<< "Element (" << i << ", " << j << ") not processed";
	}
}
1415 
Increment2DTile1D(std::atomic_int * processed_counters,size_t i,size_t start_j,size_t tile_j)1416 static void Increment2DTile1D(std::atomic_int* processed_counters, size_t i, size_t start_j, size_t tile_j) {
1417 	for (size_t j = start_j; j < start_j + tile_j; j++) {
1418 		const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j;
1419 		processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
1420 	}
1421 }
1422 
/*
 * Runs Increment2DTile1D once over the tiled 2D range on a pool created with
 * a thread count of 1 and checks that every element's counter equals 1.
 */
TEST(Parallelize2DTile1D, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per element, laid out row-major */
	std::vector<std::atomic_int> counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(), task, context,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DTile1DRangeJ;
		const size_t j = linear_idx % kParallelize2DTile1DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
1445 
/*
 * Runs Increment2DTile1D once over the tiled 2D range on a pool created with
 * a thread count of 0 and checks that every element's counter equals 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile1D, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per element, laid out row-major */
	std::vector<std::atomic_int> counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(), task, context,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DTile1DRangeJ;
		const size_t j = linear_idx % kParallelize2DTile1DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
1472 
/*
 * Reuses one pool (created with a thread count of 1) for kIncrementIterations
 * consecutive tiled 2D parallelize calls and checks that every element's
 * counter equals the number of calls.
 */
TEST(Parallelize2DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element, laid out row-major */
	std::vector<std::atomic_int> counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_2d_tile_1d(
			threadpool.get(), task, context,
			kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
			0 /* flags */);
	}

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DTile1DRangeJ;
		const size_t j = linear_idx % kParallelize2DTile1DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
			<< "Element (" << i << ", " << j << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
1498 
/*
 * Reuses one pool (created with a thread count of 0) for kIncrementIterations
 * consecutive tiled 2D parallelize calls and checks that every element's
 * counter equals the number of calls. Skipped when the pool reports at most
 * one thread.
 */
TEST(Parallelize2DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element, laid out row-major */
	std::vector<std::atomic_int> counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_2d_tile_1d(
			threadpool.get(), task, context,
			kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
			0 /* flags */);
	}

	/* Walk the flat array in row-major order, recovering (i, j) for the message */
	const size_t item_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		const size_t i = linear_idx / kParallelize2DTile1DRangeJ;
		const size_t j = linear_idx % kParallelize2DTile1DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
			<< "Element (" << i << ", " << j << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
1528 
/*
 * Task for the contention test: bumps one shared counter once per element of
 * the tile [start_j, start_j + tile_j), using a separate atomic add for each
 * element. The row index does not affect the result.
 */
static void IncrementSame2DTile1D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t tile_j) {
	const size_t end_j = start_j + tile_j;
	size_t j = start_j;
	while (j < end_j) {
		num_processed_items->fetch_add(1, std::memory_order_relaxed);
		++j;
	}
}
1534 
/*
 * Has every tile increment one shared counter (via IncrementSame2DTile1D) and
 * checks that the final count equals the number of elements in the tiled 2D
 * range. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile1D, MultiThreadPoolHighContention) {
	/* Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(IncrementSame2DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
	/* Convert to the constants' type so the comparison is not mixed-sign */
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ);
}
1553 
WorkImbalance2DTile1D(std::atomic_int * num_processed_items,size_t i,size_t start_j,size_t tile_j)1554 static void WorkImbalance2DTile1D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t tile_j) {
1555 	num_processed_items->fetch_add(tile_j, std::memory_order_relaxed);
1556 	if (i == 0 && start_j == 0) {
1557 		/* Spin-wait until all items are computed */
1558 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ) {
1559 			std::atomic_thread_fence(std::memory_order_acquire);
1560 		}
1561 	}
1562 }
1563 
/*
 * Runs WorkImbalance2DTile1D on a pool created with pthreadpool_create(0).
 * The invocation for the tile at (0, 0) spins until the shared counter reaches
 * the total item count, so the parallelize call can only return once all the
 * remaining tiles have been counted while that invocation is still spinning.
 * Afterwards the counter must equal the number of items in the range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile1D, MultiThreadPoolWorkStealing) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(WorkImbalance2DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ);
}
1582 
/*
 * No-op task for pthreadpool_parallelize_2d_tile_2d: ignores every argument.
 * Used by the tests that only check that a parallelize call returns.
 */
static void ComputeNothing2DTile2D(
	void* /* context */,
	size_t /* start_i */,
	size_t /* start_j */,
	size_t /* tile_i */,
	size_t /* tile_j */)
{
	/* Intentionally empty. */
}
1585 
/*
 * Runs the no-op task through pthreadpool_parallelize_2d_tile_2d on a pool
 * created with pthreadpool_create(1); the test passes if the call returns.
 */
TEST(Parallelize2DTile2D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_t task = ComputeNothing2DTile2D;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(), task, context,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1597 
/*
 * Runs the no-op task through pthreadpool_parallelize_2d_tile_2d on a pool
 * created with pthreadpool_create(0); the test passes if the call returns.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_t task = ComputeNothing2DTile2D;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(), task, context,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1614 
CheckBounds2DTile2D(void *,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)1615 static void CheckBounds2DTile2D(void*, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
1616 	EXPECT_LT(start_i, kParallelize2DTile2DRangeI);
1617 	EXPECT_LT(start_j, kParallelize2DTile2DRangeJ);
1618 	EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI);
1619 	EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ);
1620 }
1621 
/*
 * Runs CheckBounds2DTile2D for every tile on a pool created with
 * pthreadpool_create(1), so each tile handed to the task is bounds-checked.
 */
TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_t task = CheckBounds2DTile2D;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(), task, context,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1634 
/*
 * Runs CheckBounds2DTile2D for every tile on a pool created with
 * pthreadpool_create(0), so each tile handed to the task is bounds-checked.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_t task = CheckBounds2DTile2D;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(), task, context,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1651 
CheckTiling2DTile2D(void *,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)1652 static void CheckTiling2DTile2D(void*, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
1653 	EXPECT_GT(tile_i, 0);
1654 	EXPECT_LE(tile_i, kParallelize2DTile2DTileI);
1655 	EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0);
1656 	EXPECT_EQ(tile_i, std::min<size_t>(kParallelize2DTile2DTileI, kParallelize2DTile2DRangeI - start_i));
1657 
1658 	EXPECT_GT(tile_j, 0);
1659 	EXPECT_LE(tile_j, kParallelize2DTile2DTileJ);
1660 	EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0);
1661 	EXPECT_EQ(tile_j, std::min<size_t>(kParallelize2DTile2DTileJ, kParallelize2DTile2DRangeJ - start_j));
1662 }
1663 
/*
 * Runs CheckTiling2DTile2D for every tile on a pool created with
 * pthreadpool_create(1), so each tile's start and extent are validated.
 */
TEST(Parallelize2DTile2D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_t task = CheckTiling2DTile2D;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(), task, context,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1676 
/*
 * Runs CheckTiling2DTile2D for every tile on a pool created with
 * pthreadpool_create(0), so each tile's start and extent are validated.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_t task = CheckTiling2DTile2D;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(), task, context,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1693 
SetTrue2DTile2D(std::atomic_bool * processed_indicators,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)1694 static void SetTrue2DTile2D(std::atomic_bool* processed_indicators, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
1695 	for (size_t i = start_i; i < start_i + tile_i; i++) {
1696 		for (size_t j = start_j; j < start_j + tile_j; j++) {
1697 			const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
1698 			processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
1699 		}
1700 	}
1701 }
1702 
/*
 * Runs SetTrue2DTile2D over the whole range on a pool created with
 * pthreadpool_create(1), then expects the indicator of every (i, j) item to be
 * set.
 */
TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsProcessed) {
	std::vector<std::atomic_bool> indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_t task =
		reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(SetTrue2DTile2D);
	void* const context = indicators.data();
	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(), task, context,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals i * kParallelize2DTile2DRangeJ + j. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j, ++linear_idx) {
			EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
				<< "Element (" << i << ", " << j << ") not processed";
		}
	}
}
1725 
/*
 * Runs SetTrue2DTile2D over the whole range on a pool created with
 * pthreadpool_create(0), then expects the indicator of every (i, j) item to be
 * set. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsProcessed) {
	std::vector<std::atomic_bool> indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_t task =
		reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(SetTrue2DTile2D);
	void* const context = indicators.data();
	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(), task, context,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals i * kParallelize2DTile2DRangeJ + j. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j, ++linear_idx) {
			EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
				<< "Element (" << i << ", " << j << ") not processed";
		}
	}
}
1752 
Increment2DTile2D(std::atomic_int * processed_counters,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)1753 static void Increment2DTile2D(std::atomic_int* processed_counters, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
1754 	for (size_t i = start_i; i < start_i + tile_i; i++) {
1755 		for (size_t j = start_j; j < start_j + tile_j; j++) {
1756 			const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
1757 			processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
1758 		}
1759 	}
1760 }
1761 
/*
 * Runs Increment2DTile2D once over the whole range on a pool created with
 * pthreadpool_create(1), then expects every per-item counter to read exactly 1.
 */
TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedOnce) {
	std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_t task =
		reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D);
	void* const context = counters.data();
	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(), task, context,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals i * kParallelize2DTile2DRangeJ + j. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j, ++linear_idx) {
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
		}
	}
}
1785 
/*
 * Runs Increment2DTile2D once over the whole range on a pool created with
 * pthreadpool_create(0), then expects every per-item counter to read exactly 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedOnce) {
	std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_t task =
		reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D);
	void* const context = counters.data();
	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(), task, context,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals i * kParallelize2DTile2DRangeJ + j. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j, ++linear_idx) {
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
		}
	}
}
1813 
/*
 * Runs Increment2DTile2D over the whole range kIncrementIterations times on a
 * pool created with pthreadpool_create(1), reusing the same pool and counters,
 * then expects every per-item counter to read kIncrementIterations.
 */
TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_t task =
		reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D);
	void* const context = counters.data();
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_2d_tile_2d(
			threadpool.get(), task, context,
			/* range */
			kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
			/* tile */
			kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
			0 /* flags */);
	}

	/* Row-major walk: linear_idx always equals i * kParallelize2DTile2DRangeJ + j. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j, ++linear_idx) {
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
				<< "(expected: " << kIncrementIterations << ")";
		}
	}
}
1840 
/*
 * Runs Increment2DTile2D over the whole range kIncrementIterations times on a
 * pool created with pthreadpool_create(0), reusing the same pool and counters,
 * then expects every per-item counter to read kIncrementIterations.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_t task =
		reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D);
	void* const context = counters.data();
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_2d_tile_2d(
			threadpool.get(), task, context,
			/* range */
			kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
			/* tile */
			kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
			0 /* flags */);
	}

	/* Row-major walk: linear_idx always equals i * kParallelize2DTile2DRangeJ + j. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j, ++linear_idx) {
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
				<< "(expected: " << kIncrementIterations << ")";
		}
	}
}
1871 
/*
 * Task for pthreadpool_parallelize_2d_tile_2d: bumps the single shared counter
 * once for every (i, j) item of the tile
 * [start_i, start_i + tile_i) x [start_j, start_j + tile_j).
 */
static void IncrementSame2DTile2D(std::atomic_int* num_processed_items, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
	const size_t end_i = start_i + tile_i;
	const size_t end_j = start_j + tile_j;
	for (size_t row = start_i; row < end_i; ++row) {
		for (size_t col = start_j; col < end_j; ++col) {
			/* One relaxed increment per item, all aimed at the same atomic. */
			num_processed_items->fetch_add(1, std::memory_order_relaxed);
		}
	}
}
1879 
/*
 * Runs IncrementSame2DTile2D over the whole 2D range on a pool created with
 * pthreadpool_create(0), so every item increments the same atomic counter, and
 * checks that the final count equals the number of items in the range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolHighContention) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(IncrementSame2DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);
}
1899 
WorkImbalance2DTile2D(std::atomic_int * num_processed_items,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)1900 static void WorkImbalance2DTile2D(std::atomic_int* num_processed_items, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
1901 	num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed);
1902 	if (start_i == 0 && start_j == 0) {
1903 		/* Spin-wait until all items are computed */
1904 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) {
1905 			std::atomic_thread_fence(std::memory_order_acquire);
1906 		}
1907 	}
1908 }
1909 
/*
 * Runs WorkImbalance2DTile2D on a pool created with pthreadpool_create(0).
 * The invocation for the tile at (0, 0) spins until the shared counter reaches
 * the total item count, so the parallelize call can only return once all the
 * remaining tiles have been counted while that invocation is still spinning.
 * Afterwards the counter must equal the number of items in the range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolWorkStealing) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(WorkImbalance2DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);
}
1929 
/*
 * No-op task for pthreadpool_parallelize_2d_tile_2d_with_uarch: ignores every
 * argument. Used by the tests that only check that a parallelize call returns.
 */
static void ComputeNothing2DTile2DWithUArch(
	void* /* context */,
	uint32_t /* uarch_index */,
	size_t /* start_i */,
	size_t /* start_j */,
	size_t /* tile_i */,
	size_t /* tile_j */)
{
	/* Intentionally empty. */
}
1932 
/*
 * Runs the no-op task through pthreadpool_parallelize_2d_tile_2d_with_uarch on
 * a pool created with pthreadpool_create(1); the test passes if the call
 * returns.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_with_id_t task = ComputeNothing2DTile2DWithUArch;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1945 
/*
 * Runs the no-op task through pthreadpool_parallelize_2d_tile_2d_with_uarch on
 * a pool created with pthreadpool_create(0); the test passes if the call
 * returns. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_with_id_t task = ComputeNothing2DTile2DWithUArch;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1963 
CheckUArch2DTile2DWithUArch(void *,uint32_t uarch_index,size_t,size_t,size_t,size_t)1964 static void CheckUArch2DTile2DWithUArch(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t) {
1965 	if (uarch_index != kDefaultUArchIndex) {
1966 		EXPECT_LE(uarch_index, kMaxUArchIndex);
1967 	}
1968 }
1969 
/*
 * Runs CheckUArch2DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(1), so the uarch index handed to each invocation is
 * validated.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolUArchInBounds) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_with_id_t task = CheckUArch2DTile2DWithUArch;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1983 
/*
 * Runs CheckUArch2DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(0), so the uarch index handed to each invocation is
 * validated. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUArchInBounds) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_with_id_t task = CheckUArch2DTile2DWithUArch;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
2001 
CheckBounds2DTile2DWithUArch(void *,uint32_t,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)2002 static void CheckBounds2DTile2DWithUArch(void*, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
2003 	EXPECT_LT(start_i, kParallelize2DTile2DRangeI);
2004 	EXPECT_LT(start_j, kParallelize2DTile2DRangeJ);
2005 	EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI);
2006 	EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ);
2007 }
2008 
/*
 * Runs CheckBounds2DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(1), so each tile handed to the task is bounds-checked.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_with_id_t task = CheckBounds2DTile2DWithUArch;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
2022 
/*
 * Runs CheckBounds2DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(0), so each tile handed to the task is bounds-checked.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_with_id_t task = CheckBounds2DTile2DWithUArch;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
2040 
CheckTiling2DTile2DWithUArch(void *,uint32_t,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)2041 static void CheckTiling2DTile2DWithUArch(void*, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
2042 	EXPECT_GT(tile_i, 0);
2043 	EXPECT_LE(tile_i, kParallelize2DTile2DTileI);
2044 	EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0);
2045 	EXPECT_EQ(tile_i, std::min<size_t>(kParallelize2DTile2DTileI, kParallelize2DTile2DRangeI - start_i));
2046 
2047 	EXPECT_GT(tile_j, 0);
2048 	EXPECT_LE(tile_j, kParallelize2DTile2DTileJ);
2049 	EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0);
2050 	EXPECT_EQ(tile_j, std::min<size_t>(kParallelize2DTile2DTileJ, kParallelize2DTile2DRangeJ - start_j));
2051 }
2052 
/*
 * Runs CheckTiling2DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(1), so each tile's start and extent are validated.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_with_id_t task = CheckTiling2DTile2DWithUArch;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
2066 
/*
 * Runs CheckTiling2DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(0), so each tile's start and extent are validated.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_with_id_t task = CheckTiling2DTile2DWithUArch;
	void* const context = nullptr;
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
2084 
SetTrue2DTile2DWithUArch(std::atomic_bool * processed_indicators,uint32_t,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)2085 static void SetTrue2DTile2DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
2086 	for (size_t i = start_i; i < start_i + tile_i; i++) {
2087 		for (size_t j = start_j; j < start_j + tile_j; j++) {
2088 			const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
2089 			processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
2090 		}
2091 	}
2092 }
2093 
/*
 * Runs SetTrue2DTile2DWithUArch over the whole range on a pool created with
 * pthreadpool_create(1), then expects the indicator of every (i, j) item to be
 * set.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) {
	std::vector<std::atomic_bool> indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_with_id_t task =
		reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(SetTrue2DTile2DWithUArch);
	void* const context = indicators.data();
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals i * kParallelize2DTile2DRangeJ + j. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j, ++linear_idx) {
			EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
				<< "Element (" << i << ", " << j << ") not processed";
		}
	}
}
2117 
/*
 * Runs SetTrue2DTile2DWithUArch over the whole range on a pool created with
 * pthreadpool_create(0), then expects the indicator of every (i, j) item to be
 * set. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) {
	std::vector<std::atomic_bool> indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_with_id_t task =
		reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(SetTrue2DTile2DWithUArch);
	void* const context = indicators.data();
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals i * kParallelize2DTile2DRangeJ + j. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j, ++linear_idx) {
			EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
				<< "Element (" << i << ", " << j << ") not processed";
		}
	}
}
2145 
Increment2DTile2DWithUArch(std::atomic_int * processed_counters,uint32_t,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)2146 static void Increment2DTile2DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
2147 	for (size_t i = start_i; i < start_i + tile_i; i++) {
2148 		for (size_t j = start_j; j < start_j + tile_j; j++) {
2149 			const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
2150 			processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
2151 		}
2152 	}
2153 }
2154 
/*
 * Runs Increment2DTile2DWithUArch once over the whole range on a pool created
 * with pthreadpool_create(1), then expects every per-item counter to read
 * exactly 1.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) {
	std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_with_id_t task =
		reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(Increment2DTile2DWithUArch);
	void* const context = counters.data();
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals i * kParallelize2DTile2DRangeJ + j. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j, ++linear_idx) {
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
		}
	}
}
2179 
/*
 * Runs Increment2DTile2DWithUArch once over the whole range on a pool created
 * with pthreadpool_create(0), then expects every per-item counter to read
 * exactly 1. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) {
	std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

	auto_pthreadpool_t threadpool{pthreadpool_create(0), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_2d_with_id_t task =
		reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(Increment2DTile2DWithUArch);
	void* const context = counters.data();
	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		/* uarch indices */
		kDefaultUArchIndex, kMaxUArchIndex,
		/* range */
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		/* tile */
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals i * kParallelize2DTile2DRangeJ + j. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j, ++linear_idx) {
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
		}
	}
}
2208 
/*
 * Runs Increment2DTile2DWithUArch over the whole range kIncrementIterations
 * times on a pool created with pthreadpool_create(1), reusing the same pool and
 * counters, then expects every per-item counter to read kIncrementIterations.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) {
	std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

	auto_pthreadpool_t threadpool{pthreadpool_create(1), &pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_2d_tile_2d_with_id_t task =
		reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(Increment2DTile2DWithUArch);
	void* const context = counters.data();
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_2d_tile_2d_with_uarch(
			threadpool.get(), task, context,
			/* uarch indices */
			kDefaultUArchIndex, kMaxUArchIndex,
			/* range */
			kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
			/* tile */
			kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
			0 /* flags */);
	}

	/* Row-major walk: linear_idx always equals i * kParallelize2DTile2DRangeJ + j. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j, ++linear_idx) {
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
				<< "(expected: " << kIncrementIterations << ")";
		}
	}
}
2236 
/* Same check as the single-thread variant, but on a pool created with a thread count
 * argument of 0; the test is skipped when that pool reports at most one thread. */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element, addressed as i * RangeJ + j. */
	const size_t num_items = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(Increment2DTile2DWithUArch);
	void* const context = counters.data();
	for (size_t iteration = 0; iteration != kIncrementIterations; ++iteration) {
		pthreadpool_parallelize_2d_tile_2d_with_uarch(
			threadpool.get(), task, context,
			kDefaultUArchIndex, kMaxUArchIndex,
			kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
			kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
			0 /* flags */);
	}

	/* Check the counters once all runs have returned. */
	for (size_t i = 0; i != kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize2DTile2DRangeJ; ++j) {
			const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
				<< "(expected: " << kIncrementIterations << ")";
		}
	}
}
2268 
/* Task body for the high-contention test: performs one relaxed increment of the single
 * shared counter for every (i, j) element of the tile. The uarch index is ignored. */
static void IncrementSame2DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
	const size_t end_i = start_i + tile_i;
	const size_t end_j = start_j + tile_j;
	for (size_t row = start_i; row < end_i; ++row) {
		/* Deliberately one atomic operation per element rather than one per tile. */
		for (size_t col = start_j; col < end_j; ++col) {
			num_processed_items->fetch_add(1, std::memory_order_relaxed);
		}
	}
}
2276 
/* All tiles increment one shared counter; after the job returns the counter must equal
 * the number of elements in the 2D range. Skipped when the pool has at most one thread. */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolHighContention) {
	/* Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20. */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(IncrementSame2DTile2DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);
}
2297 
WorkImbalance2DTile2DWithUArch(std::atomic_int * num_processed_items,uint32_t,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)2298 static void WorkImbalance2DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
2299 	num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed);
2300 	if (start_i == 0 && start_j == 0) {
2301 		/* Spin-wait until all items are computed */
2302 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) {
2303 			std::atomic_thread_fence(std::memory_order_acquire);
2304 		}
2305 	}
2306 }
2307 
/* Runs WorkImbalance2DTile2DWithUArch, whose (0, 0) tile blocks until the shared counter
 * reaches the full range size, and expects the job to finish with the counter equal to
 * the number of elements. Skipped when the pool has at most one thread. */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolWorkStealing) {
	/* Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20. */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_2d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(WorkImbalance2DTile2DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);
}
2328 
/* No-op 3D task: ignores the context pointer and all three indices. */
static void ComputeNothing3D(void* /* context */, size_t /* i */, size_t /* j */, size_t /* k */) {
	/* Intentionally empty. */
}
2331 
/* Smoke test: dispatches the no-op 3D task on a pool created with a thread count of 1. */
TEST(Parallelize3D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_3d(
		threadpool.get(), ComputeNothing3D, nullptr /* context */,
		kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
		0 /* flags */);
}
2342 
/* Smoke test: dispatches the no-op 3D task on a pool created with a thread count
 * argument of 0. Skipped when that pool reports at most one thread. */
TEST(Parallelize3D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d(
		threadpool.get(), ComputeNothing3D, nullptr /* context */,
		kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
		0 /* flags */);
}
2358 
CheckBounds3D(void *,size_t i,size_t j,size_t k)2359 static void CheckBounds3D(void*, size_t i, size_t j, size_t k) {
2360 	EXPECT_LT(i, kParallelize3DRangeI);
2361 	EXPECT_LT(j, kParallelize3DRangeJ);
2362 	EXPECT_LT(k, kParallelize3DRangeK);
2363 }
2364 
/* Runs CheckBounds3D over the 3D range on a pool created with a thread count of 1. */
TEST(Parallelize3D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The expectations live inside the task itself; nothing to verify afterwards. */
	pthreadpool_parallelize_3d(
		threadpool.get(), CheckBounds3D, nullptr /* context */,
		kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
		0 /* flags */);
}
2376 
/* Runs CheckBounds3D over the 3D range on a pool created with a thread count argument
 * of 0. Skipped when that pool reports at most one thread. */
TEST(Parallelize3D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	/* The expectations live inside the task itself; nothing to verify afterwards. */
	pthreadpool_parallelize_3d(
		threadpool.get(), CheckBounds3D, nullptr /* context */,
		kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
		0 /* flags */);
}
2392 
SetTrue3D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k)2393 static void SetTrue3D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k) {
2394 	const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k;
2395 	processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
2396 }
2397 
/* Runs SetTrue3D over the 3D range on a pool created with a thread count of 1 and
 * expects every element's flag to be set afterwards. */
TEST(Parallelize3D, SingleThreadPoolAllItemsProcessed) {
	/* One flag per (i, j, k) element. */
	const size_t num_items = kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_3d_t>(SetTrue3D);
	void* const context = indicators.data();
	pthreadpool_parallelize_3d(
		threadpool.get(), task, context,
		kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
		0 /* flags */);

	for (size_t i = 0; i != kParallelize3DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k;
				EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
					<< "Element (" << i << ", " << j << ", " << k << ") not processed";
			}
		}
	}
}
2421 
/* Runs SetTrue3D over the 3D range on a pool created with a thread count argument of 0
 * and expects every element's flag to be set. Skipped when the pool has at most one thread. */
TEST(Parallelize3D, MultiThreadPoolAllItemsProcessed) {
	/* One flag per (i, j, k) element. */
	const size_t num_items = kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_3d_t>(SetTrue3D);
	void* const context = indicators.data();
	pthreadpool_parallelize_3d(
		threadpool.get(), task, context,
		kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
		0 /* flags */);

	for (size_t i = 0; i != kParallelize3DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k;
				EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
					<< "Element (" << i << ", " << j << ", " << k << ") not processed";
			}
		}
	}
}
2449 
Increment3D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k)2450 static void Increment3D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k) {
2451 	const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k;
2452 	processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
2453 }
2454 
/* Runs Increment3D once over the 3D range on a pool created with a thread count of 1
 * and expects every element's counter to be exactly 1. */
TEST(Parallelize3D, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_3d_t>(Increment3D);
	void* const context = counters.data();
	pthreadpool_parallelize_3d(
		threadpool.get(), task, context,
		kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
		0 /* flags */);

	for (size_t i = 0; i != kParallelize3DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k;
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
2479 
/* Runs Increment3D once over the 3D range on a pool created with a thread count argument
 * of 0 and expects every counter to be exactly 1. Skipped when the pool has at most one thread. */
TEST(Parallelize3D, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_3d_t>(Increment3D);
	void* const context = counters.data();
	pthreadpool_parallelize_3d(
		threadpool.get(), task, context,
		kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
		0 /* flags */);

	for (size_t i = 0; i != kParallelize3DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k;
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
2508 
/* Runs Increment3D kIncrementIterations times over the 3D range on a pool created with a
 * thread count of 1 and expects every counter to equal kIncrementIterations. */
TEST(Parallelize3D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_3d_t>(Increment3D);
	void* const context = counters.data();
	for (size_t iteration = 0; iteration != kIncrementIterations; ++iteration) {
		pthreadpool_parallelize_3d(
			threadpool.get(), task, context,
			kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
			0 /* flags */);
	}

	for (size_t i = 0; i != kParallelize3DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k;
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
2536 
/* Runs Increment3D kIncrementIterations times over the 3D range on a pool created with a
 * thread count argument of 0 and expects every counter to equal kIncrementIterations.
 * Skipped when the pool has at most one thread. */
TEST(Parallelize3D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_3d_t>(Increment3D);
	void* const context = counters.data();
	for (size_t iteration = 0; iteration != kIncrementIterations; ++iteration) {
		pthreadpool_parallelize_3d(
			threadpool.get(), task, context,
			kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
			0 /* flags */);
	}

	for (size_t i = 0; i != kParallelize3DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k;
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
2568 
/* Task body for the high-contention test: every invocation adds 1 (relaxed) to the same
 * shared counter, regardless of which (i, j, k) element it was called for. */
static void IncrementSame3D(std::atomic_int* num_processed_items, size_t /* i */, size_t /* j */, size_t /* k */) {
	std::atomic_int& shared_counter = *num_processed_items;
	shared_counter.fetch_add(1, std::memory_order_relaxed);
}
2572 
/* Every element increments one shared counter; after the job returns the counter must
 * equal the number of elements in the 3D range. Skipped when the pool has at most one thread. */
TEST(Parallelize3D, MultiThreadPoolHighContention) {
	/* Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20. */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_t>(IncrementSame3D),
		static_cast<void*>(&num_processed_items),
		kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK);
}
2591 
WorkImbalance3D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k)2592 static void WorkImbalance3D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k) {
2593 	num_processed_items->fetch_add(1, std::memory_order_relaxed);
2594 	if (i == 0 && j == 0 && k == 0) {
2595 		/* Spin-wait until all items are computed */
2596 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK) {
2597 			std::atomic_thread_fence(std::memory_order_acquire);
2598 		}
2599 	}
2600 }
2601 
/* Runs WorkImbalance3D, whose (0, 0, 0) invocation blocks until the shared counter
 * reaches the full range size, and expects the job to finish with the counter equal to
 * the number of elements. Skipped when the pool has at most one thread. */
TEST(Parallelize3D, MultiThreadPoolWorkStealing) {
	/* Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20. */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_t>(WorkImbalance3D),
		static_cast<void*>(&num_processed_items),
		kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK);
}
2620 
/* No-op 3D tile-1D task: ignores the context pointer, both indices, and the tile
 * start/size arguments. */
static void ComputeNothing3DTile1D(void* /* context */, size_t /* i */, size_t /* j */, size_t /* start_k */, size_t /* tile_k */) {
	/* Intentionally empty. */
}
2623 
/* Smoke test: dispatches the no-op 3D tile-1D task on a pool created with a thread count of 1. */
TEST(Parallelize3DTile1D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), ComputeNothing3DTile1D, nullptr /* context */,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);
}
2635 
/* Smoke test: dispatches the no-op 3D tile-1D task on a pool created with a thread count
 * argument of 0. Skipped when that pool reports at most one thread. */
TEST(Parallelize3DTile1D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), ComputeNothing3DTile1D, nullptr /* context */,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);
}
2652 
CheckBounds3DTile1D(void *,size_t i,size_t j,size_t start_k,size_t tile_k)2653 static void CheckBounds3DTile1D(void*, size_t i, size_t j, size_t start_k, size_t tile_k) {
2654 	EXPECT_LT(i, kParallelize3DTile1DRangeI);
2655 	EXPECT_LT(j, kParallelize3DTile1DRangeJ);
2656 	EXPECT_LT(start_k, kParallelize3DTile1DRangeK);
2657 	EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK);
2658 }
2659 
/* Runs CheckBounds3DTile1D over the tiled 3D range on a pool created with a thread count of 1. */
TEST(Parallelize3DTile1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The expectations live inside the task itself; nothing to verify afterwards. */
	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), CheckBounds3DTile1D, nullptr /* context */,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);
}
2672 
/* Runs CheckBounds3DTile1D over the tiled 3D range on a pool created with a thread count
 * argument of 0. Skipped when that pool reports at most one thread. */
TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	/* The expectations live inside the task itself; nothing to verify afterwards. */
	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), CheckBounds3DTile1D, nullptr /* context */,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);
}
2689 
CheckTiling3DTile1D(void *,size_t i,size_t j,size_t start_k,size_t tile_k)2690 static void CheckTiling3DTile1D(void*, size_t i, size_t j, size_t start_k, size_t tile_k) {
2691 	EXPECT_GT(tile_k, 0);
2692 	EXPECT_LE(tile_k, kParallelize3DTile1DTileK);
2693 	EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0);
2694 	EXPECT_EQ(tile_k, std::min<size_t>(kParallelize3DTile1DTileK, kParallelize3DTile1DRangeK - start_k));
2695 }
2696 
/* Runs CheckTiling3DTile1D over the tiled 3D range on a pool created with a thread count of 1. */
TEST(Parallelize3DTile1D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The expectations live inside the task itself; nothing to verify afterwards. */
	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), CheckTiling3DTile1D, nullptr /* context */,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);
}
2709 
/* Runs CheckTiling3DTile1D over the tiled 3D range on a pool created with a thread count
 * argument of 0. Skipped when that pool reports at most one thread. */
TEST(Parallelize3DTile1D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	/* The expectations live inside the task itself; nothing to verify afterwards. */
	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), CheckTiling3DTile1D, nullptr /* context */,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);
}
2726 
SetTrue3DTile1D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t start_k,size_t tile_k)2727 static void SetTrue3DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t start_k, size_t tile_k) {
2728 	for (size_t k = start_k; k < start_k + tile_k; k++) {
2729 		const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k;
2730 		processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
2731 	}
2732 }
2733 
/* Runs SetTrue3DTile1D over the tiled 3D range on a pool created with a thread count of 1
 * and expects every element's flag to be set afterwards. */
TEST(Parallelize3DTile1D, SingleThreadPoolAllItemsProcessed) {
	/* One flag per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(SetTrue3DTile1D);
	void* const context = indicators.data();
	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), task, context,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);

	for (size_t i = 0; i != kParallelize3DTile1DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DTile1DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DTile1DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k;
				EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
					<< "Element (" << i << ", " << j << ", " << k << ") not processed";
			}
		}
	}
}
2758 
/* Runs SetTrue3DTile1D over the tiled 3D range on a pool created with a thread count
 * argument of 0 and expects every flag to be set. Skipped when the pool has at most one thread. */
TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsProcessed) {
	/* One flag per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(SetTrue3DTile1D);
	void* const context = indicators.data();
	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), task, context,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);

	for (size_t i = 0; i != kParallelize3DTile1DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DTile1DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DTile1DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k;
				EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
					<< "Element (" << i << ", " << j << ", " << k << ") not processed";
			}
		}
	}
}
2787 
Increment3DTile1D(std::atomic_int * processed_counters,size_t i,size_t j,size_t start_k,size_t tile_k)2788 static void Increment3DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t start_k, size_t tile_k) {
2789 	for (size_t k = start_k; k < start_k + tile_k; k++) {
2790 		const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k;
2791 		processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
2792 	}
2793 }
2794 
/* Runs Increment3DTile1D once over the tiled 3D range on a pool created with a thread
 * count of 1 and expects every element's counter to be exactly 1. */
TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(Increment3DTile1D);
	void* const context = counters.data();
	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), task, context,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);

	for (size_t i = 0; i != kParallelize3DTile1DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DTile1DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DTile1DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k;
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
2820 
/* Runs Increment3DTile1D once over the tiled 3D range on a pool created with a thread
 * count argument of 0 and expects every counter to be exactly 1. Skipped when the pool
 * has at most one thread. */
TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(Increment3DTile1D);
	void* const context = counters.data();
	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), task, context,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);

	for (size_t i = 0; i != kParallelize3DTile1DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DTile1DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DTile1DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k;
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
2850 
/* Runs Increment3DTile1D kIncrementIterations times over the tiled 3D range on a pool
 * created with a thread count of 1 and expects every counter to equal kIncrementIterations. */
TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(Increment3DTile1D);
	void* const context = counters.data();
	for (size_t iteration = 0; iteration != kIncrementIterations; ++iteration) {
		pthreadpool_parallelize_3d_tile_1d(
			threadpool.get(), task, context,
			kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
			kParallelize3DTile1DTileK,
			0 /* flags */);
	}

	for (size_t i = 0; i != kParallelize3DTile1DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DTile1DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DTile1DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k;
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
2879 
/* Runs Increment3DTile1D kIncrementIterations times over the tiled 3D range on a pool
 * created with a thread count argument of 0 and expects every counter to equal
 * kIncrementIterations. Skipped when the pool has at most one thread. */
TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(Increment3DTile1D);
	void* const context = counters.data();
	for (size_t iteration = 0; iteration != kIncrementIterations; ++iteration) {
		pthreadpool_parallelize_3d_tile_1d(
			threadpool.get(), task, context,
			kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
			kParallelize3DTile1DTileK,
			0 /* flags */);
	}

	for (size_t i = 0; i != kParallelize3DTile1DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DTile1DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DTile1DRangeK; ++k) {
				const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k;
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
2912 
/* Task body for the high-contention test: performs one relaxed increment of the single
 * shared counter for every k in [start_k, start_k + tile_k). i and j are not used. */
static void IncrementSame3DTile1D(std::atomic_int* num_processed_items, size_t /* i */, size_t /* j */, size_t start_k, size_t tile_k) {
	const size_t end_k = start_k + tile_k;
	/* Deliberately one atomic operation per element rather than one per tile. */
	for (size_t pos = start_k; pos < end_k; ++pos) {
		num_processed_items->fetch_add(1, std::memory_order_relaxed);
	}
}
2918 
/*
 * Every tile increments one shared counter once per element (see
 * IncrementSame3DTile1D); afterwards the counter must equal the number of
 * elements in the 3D range. Skipped when the pool created with
 * pthreadpool_create(0) reports at most one thread.
 */
TEST(Parallelize3DTile1D, MultiThreadPoolHighContention) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20 */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(IncrementSame3DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);

	/* Compare as size_t on both sides to avoid a signed/unsigned comparison inside EXPECT_EQ */
	const size_t expected_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
2938 
WorkImbalance3DTile1D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t start_k,size_t tile_k)2939 static void WorkImbalance3DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t start_k, size_t tile_k) {
2940 	num_processed_items->fetch_add(tile_k, std::memory_order_relaxed);
2941 	if (i == 0 && j == 0 && start_k == 0) {
2942 		/* Spin-wait until all items are computed */
2943 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK) {
2944 			std::atomic_thread_fence(std::memory_order_acquire);
2945 		}
2946 	}
2947 }
2948 
/*
 * Runs WorkImbalance3DTile1D, whose (0, 0, 0) tile blocks until all elements
 * have been counted, and then expects the shared counter to equal the number
 * of elements in the 3D range. Skipped when the pool created with
 * pthreadpool_create(0) reports at most one thread.
 */
TEST(Parallelize3DTile1D, MultiThreadPoolWorkStealing) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20 */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(WorkImbalance3DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);

	/* Compare as size_t on both sides to avoid a signed/unsigned comparison inside EXPECT_EQ */
	const size_t expected_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
2968 
/* No-op task: ignores all six arguments and returns immediately. */
static void ComputeNothing3DTile2D(void*, size_t, size_t, size_t, size_t, size_t) {}
2971 
/*
 * Smoke test: dispatching the no-op task over the 3D range with 2D tiling on a
 * pool created with pthreadpool_create(1) must return.
 */
TEST(Parallelize3DTile2D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(),
		ComputeNothing3DTile2D,
		nullptr,
		kParallelize3DTile2DRangeI,
		kParallelize3DTile2DRangeJ,
		kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ,
		kParallelize3DTile2DTileK,
		0 /* flags */);
}
2983 
/*
 * Smoke test: dispatching the no-op task over the 3D range with 2D tiling on a
 * pool created with pthreadpool_create(0) must return. Skipped when the pool
 * reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(),
		ComputeNothing3DTile2D,
		nullptr,
		kParallelize3DTile2DRangeI,
		kParallelize3DTile2DRangeJ,
		kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ,
		kParallelize3DTile2DTileK,
		0 /* flags */);
}
3000 
CheckBounds3DTile2D(void *,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3001 static void CheckBounds3DTile2D(void*, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3002 	EXPECT_LT(i, kParallelize3DTile2DRangeI);
3003 	EXPECT_LT(start_j, kParallelize3DTile2DRangeJ);
3004 	EXPECT_LT(start_k, kParallelize3DTile2DRangeK);
3005 	EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ);
3006 	EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK);
3007 }
3008 
/*
 * Runs CheckBounds3DTile2D for every tile on a pool created with
 * pthreadpool_create(1); the task itself reports out-of-range tiles.
 */
TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), CheckBounds3DTile2D, nullptr,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3021 
/*
 * Runs CheckBounds3DTile2D for every tile on a pool created with
 * pthreadpool_create(0); the task itself reports out-of-range tiles.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), CheckBounds3DTile2D, nullptr,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3038 
CheckTiling3DTile2D(void *,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3039 static void CheckTiling3DTile2D(void*, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3040 	EXPECT_GT(tile_j, 0);
3041 	EXPECT_LE(tile_j, kParallelize3DTile2DTileJ);
3042 	EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0);
3043 	EXPECT_EQ(tile_j, std::min<size_t>(kParallelize3DTile2DTileJ, kParallelize3DTile2DRangeJ - start_j));
3044 
3045 	EXPECT_GT(tile_k, 0);
3046 	EXPECT_LE(tile_k, kParallelize3DTile2DTileK);
3047 	EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0);
3048 	EXPECT_EQ(tile_k, std::min<size_t>(kParallelize3DTile2DTileK, kParallelize3DTile2DRangeK - start_k));
3049 }
3050 
/*
 * Runs CheckTiling3DTile2D for every tile on a pool created with
 * pthreadpool_create(1); the task itself reports malformed tiles.
 */
TEST(Parallelize3DTile2D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), CheckTiling3DTile2D, nullptr,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3063 
/*
 * Runs CheckTiling3DTile2D for every tile on a pool created with
 * pthreadpool_create(0); the task itself reports malformed tiles.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), CheckTiling3DTile2D, nullptr,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3080 
SetTrue3DTile2D(std::atomic_bool * processed_indicators,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3081 static void SetTrue3DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3082 	for (size_t j = start_j; j < start_j + tile_j; j++) {
3083 		for (size_t k = start_k; k < start_k + tile_k; k++) {
3084 			const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k;
3085 			processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
3086 		}
3087 	}
3088 }
3089 
/*
 * Runs SetTrue3DTile2D over the 3D range on a pool created with
 * pthreadpool_create(1) and expects the flag of every element to be set.
 */
TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(SetTrue3DTile2D);
	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), task, static_cast<void*>(indicators.data()),
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
					<< "Element (" << i << ", " << j << ", " << k << ") not processed";
			}
		}
	}
}
3114 
/*
 * Runs SetTrue3DTile2D over the 3D range on a pool created with
 * pthreadpool_create(0) and expects the flag of every element to be set.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(SetTrue3DTile2D);
	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), task, static_cast<void*>(indicators.data()),
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
					<< "Element (" << i << ", " << j << ", " << k << ") not processed";
			}
		}
	}
}
3143 
Increment3DTile2D(std::atomic_int * processed_counters,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3144 static void Increment3DTile2D(std::atomic_int* processed_counters, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3145 	for (size_t j = start_j; j < start_j + tile_j; j++) {
3146 		for (size_t k = start_k; k < start_k + tile_k; k++) {
3147 			const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k;
3148 			processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
3149 		}
3150 	}
3151 }
3152 
/*
 * Runs Increment3DTile2D once over the 3D range on a pool created with
 * pthreadpool_create(1) and expects every per-element counter to be exactly 1.
 */
TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t element_count = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D);
	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), task, static_cast<void*>(counters.data()),
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
3178 
/*
 * Runs Increment3DTile2D once over the 3D range on a pool created with
 * pthreadpool_create(0) and expects every per-element counter to be exactly 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t element_count = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D);
	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), task, static_cast<void*>(counters.data()),
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
3208 
/*
 * Runs Increment3DTile2D over the 3D range kIncrementIterations times on a
 * pool created with pthreadpool_create(1) and expects every per-element
 * counter to equal kIncrementIterations.
 */
TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D);
	size_t iteration = 0;
	while (iteration < kIncrementIterations) {
		pthreadpool_parallelize_3d_tile_2d(
			threadpool.get(), task, static_cast<void*>(counters.data()),
			kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
			kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
			0 /* flags */);
		iteration++;
	}

	/* Elements are visited in row-major order, so the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
3237 
/*
 * Runs Increment3DTile2D over the 3D range kIncrementIterations times on a
 * pool created with pthreadpool_create(0) and expects every per-element
 * counter to equal kIncrementIterations. Skipped when the pool reports at most
 * one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D);
	size_t iteration = 0;
	while (iteration < kIncrementIterations) {
		pthreadpool_parallelize_3d_tile_2d(
			threadpool.get(), task, static_cast<void*>(counters.data()),
			kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
			kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
			0 /* flags */);
		iteration++;
	}

	/* Elements are visited in row-major order, so the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
3270 
/*
 * Task used by the high-contention test: performs tile_j * tile_k relaxed
 * increments of the single shared counter, one per element of the (J, K) tile.
 *
 * The tile coordinates are irrelevant to the count, so those parameters are
 * left unnamed (avoids unused-parameter warnings). Counting up to the tile
 * sizes directly also keeps the iteration count independent of
 * start + tile wrapping around.
 */
static void IncrementSame3DTile2D(std::atomic_int* num_processed_items, size_t /* i */, size_t /* start_j */, size_t /* start_k */, size_t tile_j, size_t tile_k) {
	for (size_t row = 0; row < tile_j; row++) {
		for (size_t column = 0; column < tile_k; column++) {
			num_processed_items->fetch_add(1, std::memory_order_relaxed);
		}
	}
}
3278 
/*
 * Every tile increments one shared counter once per element (see
 * IncrementSame3DTile2D); afterwards the counter must equal the number of
 * elements in the 3D range. Skipped when the pool created with
 * pthreadpool_create(0) reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolHighContention) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20 */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(IncrementSame3DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Compare as size_t on both sides to avoid a signed/unsigned comparison inside EXPECT_EQ */
	const size_t expected_items = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
3298 
WorkImbalance3DTile2D(std::atomic_int * num_processed_items,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3299 static void WorkImbalance3DTile2D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3300 	num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed);
3301 	if (i == 0 && start_j == 0 && start_k == 0) {
3302 		/* Spin-wait until all items are computed */
3303 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK) {
3304 			std::atomic_thread_fence(std::memory_order_acquire);
3305 		}
3306 	}
3307 }
3308 
/*
 * Runs WorkImbalance3DTile2D, whose (0, 0, 0) tile blocks until all elements
 * have been counted, and then expects the shared counter to equal the number
 * of elements in the 3D range. Skipped when the pool created with
 * pthreadpool_create(0) reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolWorkStealing) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20 */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(WorkImbalance3DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Compare as size_t on both sides to avoid a signed/unsigned comparison inside EXPECT_EQ */
	const size_t expected_items = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
3328 
/* No-op task for the with-uarch variant: ignores all seven arguments and returns immediately. */
static void ComputeNothing3DTile2DWithUArch(void*, uint32_t, size_t, size_t, size_t, size_t, size_t) {}
3331 
/*
 * Smoke test: dispatching the no-op with-uarch task over the 3D range with 2D
 * tiling on a pool created with pthreadpool_create(1) must return.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(),
		ComputeNothing3DTile2DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize3DTile2DRangeI,
		kParallelize3DTile2DRangeJ,
		kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ,
		kParallelize3DTile2DTileK,
		0 /* flags */);
}
3344 
/*
 * Smoke test: dispatching the no-op with-uarch task over the 3D range with 2D
 * tiling on a pool created with pthreadpool_create(0) must return. Skipped
 * when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(),
		ComputeNothing3DTile2DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize3DTile2DRangeI,
		kParallelize3DTile2DRangeJ,
		kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ,
		kParallelize3DTile2DTileK,
		0 /* flags */);
}
3362 
CheckUArch3DTile2DWithUArch(void *,uint32_t uarch_index,size_t,size_t,size_t,size_t,size_t)3363 static void CheckUArch3DTile2DWithUArch(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t, size_t) {
3364 	if (uarch_index != kDefaultUArchIndex) {
3365 		EXPECT_LE(uarch_index, kMaxUArchIndex);
3366 	}
3367 }
3368 
/*
 * Runs CheckUArch3DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(1); the task itself reports out-of-range uarch indices.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolUArchInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(), CheckUArch3DTile2DWithUArch, nullptr,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3382 
/*
 * Runs CheckUArch3DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(0); the task itself reports out-of-range uarch indices.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolUArchInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(), CheckUArch3DTile2DWithUArch, nullptr,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3400 
CheckBounds3DTile2DWithUArch(void *,uint32_t,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3401 static void CheckBounds3DTile2DWithUArch(void*, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3402 	EXPECT_LT(i, kParallelize3DTile2DRangeI);
3403 	EXPECT_LT(start_j, kParallelize3DTile2DRangeJ);
3404 	EXPECT_LT(start_k, kParallelize3DTile2DRangeK);
3405 	EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ);
3406 	EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK);
3407 }
3408 
/*
 * Runs CheckBounds3DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(1); the task itself reports out-of-range tiles.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(), CheckBounds3DTile2DWithUArch, nullptr,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3422 
/*
 * Runs CheckBounds3DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(0); the task itself reports out-of-range tiles.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(), CheckBounds3DTile2DWithUArch, nullptr,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3440 
CheckTiling3DTile2DWithUArch(void *,uint32_t,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3441 static void CheckTiling3DTile2DWithUArch(void*, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3442 	EXPECT_GT(tile_j, 0);
3443 	EXPECT_LE(tile_j, kParallelize3DTile2DTileJ);
3444 	EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0);
3445 	EXPECT_EQ(tile_j, std::min<size_t>(kParallelize3DTile2DTileJ, kParallelize3DTile2DRangeJ - start_j));
3446 
3447 	EXPECT_GT(tile_k, 0);
3448 	EXPECT_LE(tile_k, kParallelize3DTile2DTileK);
3449 	EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0);
3450 	EXPECT_EQ(tile_k, std::min<size_t>(kParallelize3DTile2DTileK, kParallelize3DTile2DRangeK - start_k));
3451 }
3452 
/*
 * Runs CheckTiling3DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(1); the task itself reports malformed tiles.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(), CheckTiling3DTile2DWithUArch, nullptr,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3466 
/*
 * Runs CheckTiling3DTile2DWithUArch for every tile on a pool created with
 * pthreadpool_create(0); the task itself reports malformed tiles.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(), CheckTiling3DTile2DWithUArch, nullptr,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3484 
SetTrue3DTile2DWithUArch(std::atomic_bool * processed_indicators,uint32_t,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3485 static void SetTrue3DTile2DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3486 	for (size_t j = start_j; j < start_j + tile_j; j++) {
3487 		for (size_t k = start_k; k < start_k + tile_k; k++) {
3488 			const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k;
3489 			processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
3490 		}
3491 	}
3492 }
3493 
/*
 * Runs SetTrue3DTile2DWithUArch over the 3D range on a pool created with
 * pthreadpool_create(1) and expects the flag of every element to be set.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_3d_tile_2d_with_id_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(SetTrue3DTile2DWithUArch);
	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(), task, static_cast<void*>(indicators.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
					<< "Element (" << i << ", " << j << ", " << k << ") not processed";
			}
		}
	}
}
3519 
/*
 * Runs SetTrue3DTile2DWithUArch over the 3D range on a pool created with
 * pthreadpool_create(0) and expects the flag of every element to be set.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_3d_tile_2d_with_id_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(SetTrue3DTile2DWithUArch);
	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(), task, static_cast<void*>(indicators.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
					<< "Element (" << i << ", " << j << ", " << k << ") not processed";
			}
		}
	}
}
3549 
Increment3DTile2DWithUArch(std::atomic_int * processed_counters,uint32_t,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3550 static void Increment3DTile2DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3551 	for (size_t j = start_j; j < start_j + tile_j; j++) {
3552 		for (size_t k = start_k; k < start_k + tile_k; k++) {
3553 			const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k;
3554 			processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
3555 		}
3556 	}
3557 }
3558 
/*
 * Runs Increment3DTile2DWithUArch once over the 3D range on a pool created
 * with pthreadpool_create(1) and expects every per-element counter to be
 * exactly 1.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) {
	const size_t element_count = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_3d_tile_2d_with_id_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(Increment3DTile2DWithUArch);
	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(), task, static_cast<void*>(counters.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
3585 
/*
 * Same check as the single-thread variant, but on a pool created with
 * pthreadpool_create(0); skipped when that pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) {
	const size_t num_items = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with fewer than two threads cannot exercise the multi-thread path */
	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(Increment3DTile2DWithUArch),
		static_cast<void*>(counters.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Row-major walk: the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i != kParallelize3DTile2DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DTile2DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DTile2DRangeK; ++k, ++linear_idx) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
3616 
/*
 * Repeats the tiled increment kIncrementIterations times on a pool created
 * with pthreadpool_create(1) and expects every counter to equal
 * kIncrementIterations afterwards.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t num_items = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	for (size_t iteration = 0; iteration < kIncrementIterations; ++iteration) {
		pthreadpool_parallelize_3d_tile_2d_with_uarch(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(Increment3DTile2DWithUArch),
			static_cast<void*>(counters.data()),
			kDefaultUArchIndex, kMaxUArchIndex,
			kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
			kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
			0 /* flags */);
	}

	/* Row-major walk: the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i != kParallelize3DTile2DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DTile2DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DTile2DRangeK; ++k, ++linear_idx) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
3646 
/*
 * Repeats the tiled increment kIncrementIterations times on a pool created
 * with pthreadpool_create(0) and expects every counter to equal
 * kIncrementIterations; skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t num_items = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with fewer than two threads cannot exercise the multi-thread path */
	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	for (size_t iteration = 0; iteration < kIncrementIterations; ++iteration) {
		pthreadpool_parallelize_3d_tile_2d_with_uarch(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(Increment3DTile2DWithUArch),
			static_cast<void*>(counters.data()),
			kDefaultUArchIndex, kMaxUArchIndex,
			kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
			kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
			0 /* flags */);
	}

	/* Row-major walk: the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i != kParallelize3DTile2DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize3DTile2DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize3DTile2DRangeK; ++k, ++linear_idx) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
3680 
/*
 * Task callback for the high-contention test: performs one relaxed increment
 * of the single shared counter per element of the tile_j x tile_k tile, so
 * every invocation updates the same atomic.
 *
 * The uint32_t argument and the i index are not needed here; both are left
 * unnamed so the signature does not declare unused parameters.
 */
static void IncrementSame3DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t /* i */, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
	for (size_t j = start_j; j < start_j + tile_j; j++) {
		for (size_t k = start_k; k < start_k + tile_k; k++) {
			/* Deliberately one increment per element rather than one per tile */
			num_processed_items->fetch_add(1, std::memory_order_relaxed);
		}
	}
}
3688 
/*
 * Has every task invocation increment one shared counter and expects the final
 * count to equal the number of elements in the 3D range. Runs on a pool
 * created with pthreadpool_create(0); skipped when that pool reports at most
 * one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolHighContention) {
	/* Direct brace initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(IncrementSame3DTile2DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);
}
3709 
WorkImbalance3DTile2DWithUArch(std::atomic_int * num_processed_items,uint32_t,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3710 static void WorkImbalance3DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3711 	num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed);
3712 	if (i == 0 && start_j == 0 && start_k == 0) {
3713 		/* Spin-wait until all items are computed */
3714 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK) {
3715 			std::atomic_thread_fence(std::memory_order_acquire);
3716 		}
3717 	}
3718 }
3719 
/*
 * Runs WorkImbalance3DTile2DWithUArch, whose origin tile blocks until all
 * elements have been counted, and expects the final count to equal the number
 * of elements in the 3D range. Runs on a pool created with
 * pthreadpool_create(0); skipped when that pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolWorkStealing) {
	/* Direct brace initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(WorkImbalance3DTile2DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);
}
3740 
/* No-op 4D task: takes the context pointer and the four indices and ignores them all. */
static void ComputeNothing4D(void*, size_t, size_t, size_t, size_t) {}
3743 
/*
 * Smoke test: dispatches the no-op task over the 4D range on a pool created
 * with pthreadpool_create(1). There is no result to check; the test passes
 * once the call returns.
 */
TEST(Parallelize4D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d(
		threadpool.get(),
		ComputeNothing4D,
		nullptr,
		kParallelize4DRangeI,
		kParallelize4DRangeJ,
		kParallelize4DRangeK,
		kParallelize4DRangeL,
		0 /* flags */);
}
3754 
/*
 * Smoke test: dispatches the no-op task over the 4D range on a pool created
 * with pthreadpool_create(0); skipped when that pool reports at most one
 * thread. The test passes once the call returns.
 */
TEST(Parallelize4D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with fewer than two threads cannot exercise the multi-thread path */
	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d(
		threadpool.get(),
		ComputeNothing4D,
		nullptr,
		kParallelize4DRangeI,
		kParallelize4DRangeJ,
		kParallelize4DRangeK,
		kParallelize4DRangeL,
		0 /* flags */);
}
3770 
CheckBounds4D(void *,size_t i,size_t j,size_t k,size_t l)3771 static void CheckBounds4D(void*, size_t i, size_t j, size_t k, size_t l) {
3772 	EXPECT_LT(i, kParallelize4DRangeI);
3773 	EXPECT_LT(j, kParallelize4DRangeJ);
3774 	EXPECT_LT(k, kParallelize4DRangeK);
3775 	EXPECT_LT(l, kParallelize4DRangeL);
3776 }
3777 
/*
 * Dispatches CheckBounds4D over the 4D range on a pool created with
 * pthreadpool_create(1); the assertions live inside the task itself.
 */
TEST(Parallelize4D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d(
		threadpool.get(),
		CheckBounds4D,
		nullptr,
		kParallelize4DRangeI,
		kParallelize4DRangeJ,
		kParallelize4DRangeK,
		kParallelize4DRangeL,
		0 /* flags */);
}
3789 
/*
 * Dispatches CheckBounds4D over the 4D range on a pool created with
 * pthreadpool_create(0); skipped when that pool reports at most one thread.
 * The assertions live inside the task itself.
 */
TEST(Parallelize4D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with fewer than two threads cannot exercise the multi-thread path */
	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d(
		threadpool.get(),
		CheckBounds4D,
		nullptr,
		kParallelize4DRangeI,
		kParallelize4DRangeJ,
		kParallelize4DRangeK,
		kParallelize4DRangeL,
		0 /* flags */);
}
3805 
SetTrue4D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l)3806 static void SetTrue4D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l) {
3807 	const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l;
3808 	processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
3809 }
3810 
/*
 * Runs SetTrue4D over the 4D range on a pool created with pthreadpool_create(1)
 * and expects the indicator of every element to be set afterwards.
 */
TEST(Parallelize4D, SingleThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_t>(SetTrue4D),
		static_cast<void*>(indicators.data()),
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);

	/* Row-major walk: the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i != kParallelize4DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize4DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize4DRangeK; ++k) {
				for (size_t l = 0; l != kParallelize4DRangeL; ++l, ++linear_idx) {
					EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
				}
			}
		}
	}
}
3836 
/*
 * Runs SetTrue4D over the 4D range on a pool created with pthreadpool_create(0)
 * and expects the indicator of every element to be set afterwards; skipped
 * when the pool reports at most one thread.
 */
TEST(Parallelize4D, MultiThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with fewer than two threads cannot exercise the multi-thread path */
	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_t>(SetTrue4D),
		static_cast<void*>(indicators.data()),
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);

	/* Row-major walk: the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i != kParallelize4DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize4DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize4DRangeK; ++k) {
				for (size_t l = 0; l != kParallelize4DRangeL; ++l, ++linear_idx) {
					EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
				}
			}
		}
	}
}
3866 
Increment4D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l)3867 static void Increment4D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l) {
3868 	const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l;
3869 	processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
3870 }
3871 
/*
 * Runs Increment4D once over the 4D range on a pool created with
 * pthreadpool_create(1) and expects every counter to equal 1.
 */
TEST(Parallelize4D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_t>(Increment4D),
		static_cast<void*>(counters.data()),
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);

	/* Row-major walk: the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i != kParallelize4DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize4DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize4DRangeK; ++k) {
				for (size_t l = 0; l != kParallelize4DRangeL; ++l, ++linear_idx) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
				}
			}
		}
	}
}
3898 
/*
 * Runs Increment4D once over the 4D range on a pool created with
 * pthreadpool_create(0) and expects every counter to equal 1; skipped when
 * the pool reports at most one thread.
 */
TEST(Parallelize4D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with fewer than two threads cannot exercise the multi-thread path */
	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_t>(Increment4D),
		static_cast<void*>(counters.data()),
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);

	/* Row-major walk: the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i != kParallelize4DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize4DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize4DRangeK; ++k) {
				for (size_t l = 0; l != kParallelize4DRangeL; ++l, ++linear_idx) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
				}
			}
		}
	}
}
3929 
/*
 * Runs Increment4D over the 4D range kIncrementIterations times on a pool
 * created with pthreadpool_create(1) and expects every counter to equal
 * kIncrementIterations afterwards.
 */
TEST(Parallelize4D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	for (size_t iteration = 0; iteration < kIncrementIterations; ++iteration) {
		pthreadpool_parallelize_4d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_4d_t>(Increment4D),
			static_cast<void*>(counters.data()),
			kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
			0 /* flags */);
	}

	/* Row-major walk: the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i != kParallelize4DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize4DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize4DRangeK; ++k) {
				for (size_t l = 0; l != kParallelize4DRangeL; ++l, ++linear_idx) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
						<< "(expected: " << kIncrementIterations << ")";
				}
			}
		}
	}
}
3959 
/*
 * Runs Increment4D over the 4D range kIncrementIterations times on a pool
 * created with pthreadpool_create(0) and expects every counter to equal
 * kIncrementIterations; skipped when the pool reports at most one thread.
 */
TEST(Parallelize4D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with fewer than two threads cannot exercise the multi-thread path */
	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	for (size_t iteration = 0; iteration < kIncrementIterations; ++iteration) {
		pthreadpool_parallelize_4d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_4d_t>(Increment4D),
			static_cast<void*>(counters.data()),
			kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
			0 /* flags */);
	}

	/* Row-major walk: the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i != kParallelize4DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize4DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize4DRangeK; ++k) {
				for (size_t l = 0; l != kParallelize4DRangeL; ++l, ++linear_idx) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
						<< "(expected: " << kIncrementIterations << ")";
				}
			}
		}
	}
}
3993 
/*
 * 4D task for the high-contention test: every invocation performs one relaxed
 * increment of the same shared counter, whatever indices it is given.
 *
 * The four index arguments are not needed and are left unnamed, matching
 * ComputeNothing4D and avoiding unused-parameter warnings.
 */
static void IncrementSame4D(std::atomic_int* num_processed_items, size_t /* i */, size_t /* j */, size_t /* k */, size_t /* l */) {
	num_processed_items->fetch_add(1, std::memory_order_relaxed);
}
3997 
/*
 * Has every task invocation increment one shared counter and expects the final
 * count to equal the number of elements in the 4D range. Runs on a pool
 * created with pthreadpool_create(0); skipped when that pool reports at most
 * one thread.
 */
TEST(Parallelize4D, MultiThreadPoolHighContention) {
	/* Direct brace initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_t>(IncrementSame4D),
		static_cast<void*>(&num_processed_items),
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL);
}
4016 
WorkImbalance4D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l)4017 static void WorkImbalance4D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l) {
4018 	num_processed_items->fetch_add(1, std::memory_order_relaxed);
4019 	if (i == 0 && j == 0 && k == 0 && l == 0) {
4020 		/* Spin-wait until all items are computed */
4021 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL) {
4022 			std::atomic_thread_fence(std::memory_order_acquire);
4023 		}
4024 	}
4025 }
4026 
/*
 * Runs WorkImbalance4D, whose origin element blocks until all elements have
 * been counted, and expects the final count to equal the number of elements
 * in the 4D range. Runs on a pool created with pthreadpool_create(0); skipped
 * when that pool reports at most one thread.
 */
TEST(Parallelize4D, MultiThreadPoolWorkStealing) {
	/* Direct brace initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_t>(WorkImbalance4D),
		static_cast<void*>(&num_processed_items),
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL);
}
4045 
/* No-op tiled 4D task: takes the context pointer and five size_t arguments and ignores them all. */
static void ComputeNothing4DTile1D(void*, size_t, size_t, size_t, size_t, size_t) {}
4048 
/*
 * Smoke test: dispatches the no-op task over the tiled 4D range on a pool
 * created with pthreadpool_create(1). The test passes once the call returns.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		ComputeNothing4DTile1D,
		nullptr,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4060 
/*
 * Smoke test: dispatches the no-op task over the tiled 4D range on a pool
 * created with pthreadpool_create(0); skipped when that pool reports at most
 * one thread. The test passes once the call returns.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with fewer than two threads cannot exercise the multi-thread path */
	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		ComputeNothing4DTile1D,
		nullptr,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4077 
CheckBounds4DTile1D(void *,size_t i,size_t j,size_t k,size_t start_l,size_t tile_l)4078 static void CheckBounds4DTile1D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) {
4079 	EXPECT_LT(i, kParallelize4DTile1DRangeI);
4080 	EXPECT_LT(j, kParallelize4DTile1DRangeJ);
4081 	EXPECT_LT(k, kParallelize4DTile1DRangeK);
4082 	EXPECT_LT(start_l, kParallelize4DTile1DRangeL);
4083 	EXPECT_LE(start_l + tile_l, kParallelize4DTile1DRangeL);
4084 }
4085 
/*
 * Dispatches CheckBounds4DTile1D over the tiled 4D range on a pool created
 * with pthreadpool_create(1); the assertions live inside the task itself.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		CheckBounds4DTile1D,
		nullptr,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4098 
/*
 * Dispatches CheckBounds4DTile1D over the tiled 4D range on a pool created
 * with pthreadpool_create(0); skipped when that pool reports at most one
 * thread. The assertions live inside the task itself.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with fewer than two threads cannot exercise the multi-thread path */
	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		CheckBounds4DTile1D,
		nullptr,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4115 
CheckTiling4DTile1D(void *,size_t i,size_t j,size_t k,size_t start_l,size_t tile_l)4116 static void CheckTiling4DTile1D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) {
4117 	EXPECT_GT(tile_l, 0);
4118 	EXPECT_LE(tile_l, kParallelize4DTile1DTileL);
4119 	EXPECT_EQ(start_l % kParallelize4DTile1DTileL, 0);
4120 	EXPECT_EQ(tile_l, std::min<size_t>(kParallelize4DTile1DTileL, kParallelize4DTile1DRangeL - start_l));
4121 }
4122 
/*
 * Dispatches CheckTiling4DTile1D over the tiled 4D range on a pool created
 * with pthreadpool_create(1); the assertions live inside the task itself.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		CheckTiling4DTile1D,
		nullptr,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4135 
/*
 * Dispatches CheckTiling4DTile1D over the tiled 4D range on a pool created
 * with pthreadpool_create(0); skipped when that pool reports at most one
 * thread. The assertions live inside the task itself.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with fewer than two threads cannot exercise the multi-thread path */
	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		CheckTiling4DTile1D,
		nullptr,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4152 
SetTrue4DTile1D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t start_l,size_t tile_l)4153 static void SetTrue4DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) {
4154 	for (size_t l = start_l; l < start_l + tile_l; l++) {
4155 		const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l;
4156 		processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
4157 	}
4158 }
4159 
/*
 * Runs SetTrue4DTile1D over the tiled 4D range on a pool created with
 * pthreadpool_create(1) and expects the indicator of every element to be set.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(SetTrue4DTile1D),
		static_cast<void*>(indicators.data()),
		kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);

	/* Row-major walk: the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i != kParallelize4DTile1DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize4DTile1DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize4DTile1DRangeK; ++k) {
				for (size_t l = 0; l != kParallelize4DTile1DRangeL; ++l, ++linear_idx) {
					EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
				}
			}
		}
	}
}
4186 
/*
 * Runs SetTrue4DTile1D over the tiled 4D range on a pool created with
 * pthreadpool_create(0) and expects the indicator of every element to be set;
 * skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with fewer than two threads cannot exercise the multi-thread path */
	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(SetTrue4DTile1D),
		static_cast<void*>(indicators.data()),
		kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);

	/* Row-major walk: the flat index advances by one per element */
	size_t linear_idx = 0;
	for (size_t i = 0; i != kParallelize4DTile1DRangeI; ++i) {
		for (size_t j = 0; j != kParallelize4DTile1DRangeJ; ++j) {
			for (size_t k = 0; k != kParallelize4DTile1DRangeK; ++k) {
				for (size_t l = 0; l != kParallelize4DTile1DRangeL; ++l, ++linear_idx) {
					EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
				}
			}
		}
	}
}
4217 
Increment4DTile1D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t start_l,size_t tile_l)4218 static void Increment4DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) {
4219 	for (size_t l = start_l; l < start_l + tile_l; l++) {
4220 		const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l;
4221 		processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
4222 	}
4223 }
4224 
/*
 * Runs Increment4DTile1D once over the whole 4D range on a pool created with
 * pthreadpool_create(1) and expects every per-element counter to equal 1.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per element of the 4D iteration space. */
	const size_t num_elements =
		kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ *
		kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	std::vector<std::atomic_int> counters(num_elements);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(Increment4DTile1D),
		static_cast<void*>(counters.data()),
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flattened index grows by one per element. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					linear_idx++;
				}
			}
		}
	}
}
4252 
/*
 * Runs Increment4DTile1D once over the whole 4D range on a pool created with
 * pthreadpool_create(0) and expects every per-element counter to equal 1.
 * The test is skipped when the pool reports fewer than two threads.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per element of the 4D iteration space. */
	const size_t num_elements =
		kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ *
		kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	std::vector<std::atomic_int> counters(num_elements);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(Increment4DTile1D),
		static_cast<void*>(counters.data()),
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flattened index grows by one per element. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					linear_idx++;
				}
			}
		}
	}
}
4284 
/*
 * Runs Increment4DTile1D over the whole 4D range kIncrementIterations times on
 * a pool created with pthreadpool_create(1) and expects every per-element
 * counter to equal kIncrementIterations afterwards.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element of the 4D iteration space. */
	const size_t num_elements =
		kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ *
		kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	std::vector<std::atomic_int> counters(num_elements);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Each pass re-runs the full parallel loop on the same pool and counters. */
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_4d_tile_1d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(Increment4DTile1D),
			static_cast<void*>(counters.data()),
			kParallelize4DTile1DRangeI,
			kParallelize4DTile1DRangeJ,
			kParallelize4DTile1DRangeK,
			kParallelize4DTile1DRangeL,
			kParallelize4DTile1DTileL,
			0 /* flags */);
	}

	/* Elements are visited in row-major order, so the flattened index grows by one per element. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
						<< "(expected: " << kIncrementIterations << ")";
					linear_idx++;
				}
			}
		}
	}
}
4315 
/*
 * Runs Increment4DTile1D over the whole 4D range kIncrementIterations times on
 * a pool created with pthreadpool_create(0) and expects every per-element
 * counter to equal kIncrementIterations afterwards.
 * The test is skipped when the pool reports fewer than two threads.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element of the 4D iteration space. */
	const size_t num_elements =
		kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ *
		kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	std::vector<std::atomic_int> counters(num_elements);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	/* Each pass re-runs the full parallel loop on the same pool and counters. */
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_4d_tile_1d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(Increment4DTile1D),
			static_cast<void*>(counters.data()),
			kParallelize4DTile1DRangeI,
			kParallelize4DTile1DRangeJ,
			kParallelize4DTile1DRangeK,
			kParallelize4DTile1DRangeL,
			kParallelize4DTile1DTileL,
			0 /* flags */);
	}

	/* Elements are visited in row-major order, so the flattened index grows by one per element. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
						<< "(expected: " << kIncrementIterations << ")";
					linear_idx++;
				}
			}
		}
	}
}
4350 
/*
 * Task for pthreadpool_parallelize_4d_tile_1d: atomically adds 1 to the single
 * shared counter once for every l in [start_l, start_l + tile_l).
 * The i, j and k indices are not used.
 */
static void IncrementSame4DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) {
	const size_t end_l = start_l + tile_l;
	/* One separate increment per item (not a single add of tile_l). */
	for (size_t l = start_l; l < end_l; ++l) {
		num_processed_items->fetch_add(1, std::memory_order_relaxed);
	}
}
4356 
/*
 * Runs IncrementSame4DTile1D, which bumps one shared atomic counter once per
 * item, over the whole 4D range on a pool created with pthreadpool_create(0),
 * and expects the counter to end up equal to the number of items in the range.
 * The test is skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolHighContention) {
	/* Direct-initialize the counter: the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(IncrementSame4DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);

	/* Compare in size_t on both sides so EXPECT_EQ does not mix int with size_t. */
	const size_t expected_items =
		kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ *
		kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
4376 
WorkImbalance4DTile1D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t start_l,size_t tile_l)4377 static void WorkImbalance4DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) {
4378 	num_processed_items->fetch_add(tile_l, std::memory_order_relaxed);
4379 	if (i == 0 && j == 0 && k == 0 && start_l == 0) {
4380 		/* Spin-wait until all items are computed */
4381 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL) {
4382 			std::atomic_thread_fence(std::memory_order_acquire);
4383 		}
4384 	}
4385 }
4386 
/*
 * Runs WorkImbalance4DTile1D over the whole 4D range on a pool created with
 * pthreadpool_create(0). The task that receives the first tile spins until the
 * shared counter reaches the total item count, so the parallel call can only
 * return once all remaining items have been processed by other invocations.
 * Afterwards the counter must equal the number of items in the range.
 * The test is skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolWorkStealing) {
	/* Direct-initialize the counter: the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(WorkImbalance4DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);

	/* Compare in size_t on both sides so EXPECT_EQ does not mix int with size_t. */
	const size_t expected_items =
		kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ *
		kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
4406 
/*
 * No-op task for pthreadpool_parallelize_4d_tile_2d: ignores the context
 * pointer and all index/tile arguments and does nothing.
 */
static void ComputeNothing4DTile2D(
	void* /* context */,
	size_t /* i */,
	size_t /* j */,
	size_t /* start_k */,
	size_t /* start_l */,
	size_t /* tile_k */,
	size_t /* tile_l */)
{
}
4409 
/*
 * Smoke test: pthreadpool_parallelize_4d_tile_2d with the no-op task must
 * return when run on a pool created with pthreadpool_create(1).
 */
TEST(Parallelize4DTile2D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		ComputeNothing4DTile2D,
		nullptr /* no task context */,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4421 
/*
 * Smoke test: pthreadpool_parallelize_4d_tile_2d with the no-op task must
 * return when run on a pool created with pthreadpool_create(0).
 * The test is skipped when the pool reports fewer than two threads.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		ComputeNothing4DTile2D,
		nullptr /* no task context */,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4438 
CheckBounds4DTile2D(void *,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4439 static void CheckBounds4DTile2D(void*, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4440 	EXPECT_LT(i, kParallelize4DTile2DRangeI);
4441 	EXPECT_LT(j, kParallelize4DTile2DRangeJ);
4442 	EXPECT_LT(start_k, kParallelize4DTile2DRangeK);
4443 	EXPECT_LT(start_l, kParallelize4DTile2DRangeL);
4444 	EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK);
4445 	EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL);
4446 }
4447 
/*
 * Runs CheckBounds4DTile2D for every tile of the 4D range on a pool created
 * with pthreadpool_create(1); the task itself carries the expectations.
 */
TEST(Parallelize4DTile2D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		CheckBounds4DTile2D,
		nullptr /* no task context */,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4460 
/*
 * Runs CheckBounds4DTile2D for every tile of the 4D range on a pool created
 * with pthreadpool_create(0); the task itself carries the expectations.
 * The test is skipped when the pool reports fewer than two threads.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		CheckBounds4DTile2D,
		nullptr /* no task context */,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4477 
CheckTiling4DTile2D(void *,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4478 static void CheckTiling4DTile2D(void*, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4479 	EXPECT_GT(tile_k, 0);
4480 	EXPECT_LE(tile_k, kParallelize4DTile2DTileK);
4481 	EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0);
4482 	EXPECT_EQ(tile_k, std::min<size_t>(kParallelize4DTile2DTileK, kParallelize4DTile2DRangeK - start_k));
4483 
4484 	EXPECT_GT(tile_l, 0);
4485 	EXPECT_LE(tile_l, kParallelize4DTile2DTileL);
4486 	EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0);
4487 	EXPECT_EQ(tile_l, std::min<size_t>(kParallelize4DTile2DTileL, kParallelize4DTile2DRangeL - start_l));
4488 }
4489 
/*
 * Runs CheckTiling4DTile2D for every tile of the 4D range on a pool created
 * with pthreadpool_create(1); the task itself carries the expectations.
 */
TEST(Parallelize4DTile2D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		CheckTiling4DTile2D,
		nullptr /* no task context */,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4502 
/*
 * Runs CheckTiling4DTile2D for every tile of the 4D range on a pool created
 * with pthreadpool_create(0); the task itself carries the expectations.
 * The test is skipped when the pool reports fewer than two threads.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		CheckTiling4DTile2D,
		nullptr /* no task context */,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4519 
SetTrue4DTile2D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4520 static void SetTrue4DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4521 	for (size_t k = start_k; k < start_k + tile_k; k++) {
4522 		for (size_t l = start_l; l < start_l + tile_l; l++) {
4523 			const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l;
4524 			processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
4525 		}
4526 	}
4527 }
4528 
/*
 * Runs SetTrue4DTile2D through pthreadpool_parallelize_4d_tile_2d on a pool
 * created with pthreadpool_create(1), then checks that the indicator of every
 * (i, j, k, l) element of the 4D range has been set.
 */
TEST(Parallelize4DTile2D, SingleThreadPoolAllItemsProcessed) {
	/* One indicator per element of the 4D iteration space. */
	const size_t num_elements =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
		kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_bool> indicators(num_elements);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(SetTrue4DTile2D),
		static_cast<void*>(indicators.data()),
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flattened index grows by one per element. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
					linear_idx++;
				}
			}
		}
	}
}
4555 
/*
 * Runs SetTrue4DTile2D through pthreadpool_parallelize_4d_tile_2d on a pool
 * created with pthreadpool_create(0), then checks that the indicator of every
 * (i, j, k, l) element of the 4D range has been set.
 * The test is skipped when the pool reports fewer than two threads.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolAllItemsProcessed) {
	/* One indicator per element of the 4D iteration space. */
	const size_t num_elements =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
		kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_bool> indicators(num_elements);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(SetTrue4DTile2D),
		static_cast<void*>(indicators.data()),
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flattened index grows by one per element. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
					linear_idx++;
				}
			}
		}
	}
}
4586 
Increment4DTile2D(std::atomic_int * processed_counters,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4587 static void Increment4DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4588 	for (size_t k = start_k; k < start_k + tile_k; k++) {
4589 		for (size_t l = start_l; l < start_l + tile_l; l++) {
4590 			const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l;
4591 			processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
4592 		}
4593 	}
4594 }
4595 
/*
 * Runs Increment4DTile2D once over the whole 4D range on a pool created with
 * pthreadpool_create(1) and expects every per-element counter to equal 1.
 */
TEST(Parallelize4DTile2D, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per element of the 4D iteration space. */
	const size_t num_elements =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
		kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> counters(num_elements);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D),
		static_cast<void*>(counters.data()),
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flattened index grows by one per element. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					linear_idx++;
				}
			}
		}
	}
}
4623 
/*
 * Runs Increment4DTile2D once over the whole 4D range on a pool created with
 * pthreadpool_create(0) and expects every per-element counter to equal 1.
 * The test is skipped when the pool reports fewer than two threads.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per element of the 4D iteration space. */
	const size_t num_elements =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
		kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> counters(num_elements);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D),
		static_cast<void*>(counters.data()),
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Elements are visited in row-major order, so the flattened index grows by one per element. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					linear_idx++;
				}
			}
		}
	}
}
4655 
/*
 * Runs Increment4DTile2D over the whole 4D range kIncrementIterations times on
 * a pool created with pthreadpool_create(1) and expects every per-element
 * counter to equal kIncrementIterations afterwards.
 */
TEST(Parallelize4DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element of the 4D iteration space. */
	const size_t num_elements =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
		kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> counters(num_elements);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Each pass re-runs the full parallel loop on the same pool and counters. */
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_4d_tile_2d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D),
			static_cast<void*>(counters.data()),
			kParallelize4DTile2DRangeI,
			kParallelize4DTile2DRangeJ,
			kParallelize4DTile2DRangeK,
			kParallelize4DTile2DRangeL,
			kParallelize4DTile2DTileK,
			kParallelize4DTile2DTileL,
			0 /* flags */);
	}

	/* Elements are visited in row-major order, so the flattened index grows by one per element. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
						<< "(expected: " << kIncrementIterations << ")";
					linear_idx++;
				}
			}
		}
	}
}
4686 
/*
 * Runs Increment4DTile2D over the whole 4D range kIncrementIterations times on
 * a pool created with pthreadpool_create(0) and expects every per-element
 * counter to equal kIncrementIterations afterwards.
 * The test is skipped when the pool reports fewer than two threads.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element of the 4D iteration space. */
	const size_t num_elements =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
		kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> counters(num_elements);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	/* Each pass re-runs the full parallel loop on the same pool and counters. */
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_4d_tile_2d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D),
			static_cast<void*>(counters.data()),
			kParallelize4DTile2DRangeI,
			kParallelize4DTile2DRangeJ,
			kParallelize4DTile2DRangeK,
			kParallelize4DTile2DRangeL,
			kParallelize4DTile2DTileK,
			kParallelize4DTile2DTileL,
			0 /* flags */);
	}

	/* Elements are visited in row-major order, so the flattened index grows by one per element. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
						<< "(expected: " << kIncrementIterations << ")";
					linear_idx++;
				}
			}
		}
	}
}
4721 
/*
 * Task for pthreadpool_parallelize_4d_tile_2d: atomically adds 1 to the single
 * shared counter once for every (k, l) pair with k in
 * [start_k, start_k + tile_k) and l in [start_l, start_l + tile_l).
 * The i and j indices are not used.
 */
static void IncrementSame4DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
	const size_t end_k = start_k + tile_k;
	const size_t end_l = start_l + tile_l;
	/* One separate increment per item (not a single add of tile_k * tile_l). */
	for (size_t k = start_k; k < end_k; ++k) {
		for (size_t l = start_l; l < end_l; ++l) {
			num_processed_items->fetch_add(1, std::memory_order_relaxed);
		}
	}
}
4729 
/*
 * Runs IncrementSame4DTile2D, which bumps one shared atomic counter once per
 * item, over the whole 4D range on a pool created with pthreadpool_create(0),
 * and expects the counter to end up equal to the number of items in the range.
 * The test is skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolHighContention) {
	/* Direct-initialize the counter: the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(IncrementSame4DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Compare in size_t on both sides so EXPECT_EQ does not mix int with size_t. */
	const size_t expected_items =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
		kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
4749 
WorkImbalance4DTile2D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4750 static void WorkImbalance4DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4751 	num_processed_items->fetch_add(tile_k * tile_l, std::memory_order_relaxed);
4752 	if (i == 0 && j == 0 && start_k == 0 && start_l == 0) {
4753 		/* Spin-wait until all items are computed */
4754 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL) {
4755 			std::atomic_thread_fence(std::memory_order_acquire);
4756 		}
4757 	}
4758 }
4759 
/*
 * Runs WorkImbalance4DTile2D over the whole 4D range on a pool created with
 * pthreadpool_create(0). The task that receives the first tile spins until the
 * shared counter reaches the total item count, so the parallel call can only
 * return once all remaining items have been processed by other invocations.
 * Afterwards the counter must equal the number of items in the range.
 * The test is skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolWorkStealing) {
	/* Direct-initialize the counter: the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(WorkImbalance4DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Compare in size_t on both sides so EXPECT_EQ does not mix int with size_t. */
	const size_t expected_items =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
		kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
4779 
/*
 * No-op task for pthreadpool_parallelize_4d_tile_2d_with_uarch: ignores the
 * context pointer, the uarch index and all index/tile arguments.
 */
static void ComputeNothing4DTile2DWithUArch(
	void* /* context */,
	uint32_t /* uarch_index */,
	size_t /* i */,
	size_t /* j */,
	size_t /* start_k */,
	size_t /* start_l */,
	size_t /* tile_k */,
	size_t /* tile_l */)
{
}
4782 
/*
 * Smoke test: pthreadpool_parallelize_4d_tile_2d_with_uarch with the no-op
 * task must return when run on a pool created with pthreadpool_create(1).
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		threadpool.get(),
		ComputeNothing4DTile2DWithUArch,
		nullptr /* no task context */,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4795 
/*
 * Smoke test: pthreadpool_parallelize_4d_tile_2d_with_uarch with the no-op
 * task must return when run on a pool created with pthreadpool_create(0).
 * The test is skipped when the pool reports fewer than two threads.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		threadpool.get(),
		ComputeNothing4DTile2DWithUArch,
		nullptr /* no task context */,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4813 
CheckUArch4DTile2DWithUArch(void *,uint32_t uarch_index,size_t,size_t,size_t,size_t,size_t,size_t)4814 static void CheckUArch4DTile2DWithUArch(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t, size_t, size_t) {
4815 	if (uarch_index != kDefaultUArchIndex) {
4816 		EXPECT_LE(uarch_index, kMaxUArchIndex);
4817 	}
4818 }
4819 
/*
 * Runs CheckUArch4DTile2DWithUArch for every tile of the 4D range on a pool
 * created with pthreadpool_create(1); the task itself carries the expectation
 * on the uarch index.
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolUArchInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		threadpool.get(),
		CheckUArch4DTile2DWithUArch,
		nullptr /* no task context */,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4833 
/*
 * Runs CheckUArch4DTile2DWithUArch over the whole range on a pool created with pthreadpool_create(0).
 * Skipped when that pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolUArchInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool.get(), CheckUArch4DTile2DWithUArch, nullptr,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);
}
4851 
CheckBounds4DTile2DWithUArch(void *,uint32_t,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4852 static void CheckBounds4DTile2DWithUArch(void*, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4853 	EXPECT_LT(i, kParallelize4DTile2DRangeI);
4854 	EXPECT_LT(j, kParallelize4DTile2DRangeJ);
4855 	EXPECT_LT(start_k, kParallelize4DTile2DRangeK);
4856 	EXPECT_LT(start_l, kParallelize4DTile2DRangeL);
4857 	EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK);
4858 	EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL);
4859 }
4860 
/* Runs CheckBounds4DTile2DWithUArch over the whole range on a pool created with one thread. */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool.get(), CheckBounds4DTile2DWithUArch, nullptr,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);
}
4874 
/*
 * Runs CheckBounds4DTile2DWithUArch over the whole range on a pool created with pthreadpool_create(0).
 * Skipped when that pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool.get(), CheckBounds4DTile2DWithUArch, nullptr,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);
}
4892 
CheckTiling4DTile2DWithUArch(void *,uint32_t,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4893 static void CheckTiling4DTile2DWithUArch(void*, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4894 	EXPECT_GT(tile_k, 0);
4895 	EXPECT_LE(tile_k, kParallelize4DTile2DTileK);
4896 	EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0);
4897 	EXPECT_EQ(tile_k, std::min<size_t>(kParallelize4DTile2DTileK, kParallelize4DTile2DRangeK - start_k));
4898 
4899 	EXPECT_GT(tile_l, 0);
4900 	EXPECT_LE(tile_l, kParallelize4DTile2DTileL);
4901 	EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0);
4902 	EXPECT_EQ(tile_l, std::min<size_t>(kParallelize4DTile2DTileL, kParallelize4DTile2DRangeL - start_l));
4903 }
4904 
/* Runs CheckTiling4DTile2DWithUArch over the whole range on a pool created with one thread. */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool.get(), CheckTiling4DTile2DWithUArch, nullptr,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);
}
4918 
/*
 * Runs CheckTiling4DTile2DWithUArch over the whole range on a pool created with pthreadpool_create(0).
 * Skipped when that pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool.get(), CheckTiling4DTile2DWithUArch, nullptr,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);
}
4936 
SetTrue4DTile2DWithUArch(std::atomic_bool * processed_indicators,uint32_t,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4937 static void SetTrue4DTile2DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4938 	for (size_t k = start_k; k < start_k + tile_k; k++) {
4939 		for (size_t l = start_l; l < start_l + tile_l; l++) {
4940 			const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l;
4941 			processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
4942 		}
4943 	}
4944 }
4945 
/*
 * On a pool created with one thread, every element of the 4D range must have its indicator
 * set by SetTrue4DTile2DWithUArch after a single parallelize call.
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_bool> flags(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(SetTrue4DTile2DWithUArch),
		static_cast<void*>(flags.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					EXPECT_TRUE(flags[flat].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
					flat++;
				}
			}
		}
	}
}
4973 
/*
 * On a pool created with pthreadpool_create(0), every element of the 4D range must have its
 * indicator set by SetTrue4DTile2DWithUArch. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_bool> flags(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(SetTrue4DTile2DWithUArch),
		static_cast<void*>(flags.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					EXPECT_TRUE(flags[flat].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
					flat++;
				}
			}
		}
	}
}
5005 
Increment4DTile2DWithUArch(std::atomic_int * processed_counters,uint32_t,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)5006 static void Increment4DTile2DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
5007 	for (size_t k = start_k; k < start_k + tile_k; k++) {
5008 		for (size_t l = start_l; l < start_l + tile_l; l++) {
5009 			const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l;
5010 			processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
5011 		}
5012 	}
5013 }
5014 
/*
 * On a pool created with one thread, a single parallelize call with Increment4DTile2DWithUArch
 * must leave every per-element counter at exactly 1.
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) {
	const size_t element_count = kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> hits(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(Increment4DTile2DWithUArch),
		static_cast<void*>(hits.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					const std::atomic_int& counter = hits[flat];
					EXPECT_EQ(counter.load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counter.load(std::memory_order_relaxed) << " times (expected: 1)";
					flat++;
				}
			}
		}
	}
}
5043 
/*
 * On a pool created with pthreadpool_create(0), a single parallelize call with
 * Increment4DTile2DWithUArch must leave every per-element counter at exactly 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) {
	const size_t element_count = kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> hits(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(Increment4DTile2DWithUArch),
		static_cast<void*>(hits.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					const std::atomic_int& counter = hits[flat];
					EXPECT_EQ(counter.load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counter.load(std::memory_order_relaxed) << " times (expected: 1)";
					flat++;
				}
			}
		}
	}
}
5076 
/*
 * On a pool created with one thread, kIncrementIterations back-to-back parallelize calls with
 * Increment4DTile2DWithUArch must leave every per-element counter at kIncrementIterations.
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count = kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> hits(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	for (size_t pass = 0; pass < kIncrementIterations; pass++) {
		pthreadpool_parallelize_4d_tile_2d_with_uarch(
			pool.get(),
			reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(Increment4DTile2DWithUArch),
			static_cast<void*>(hits.data()),
			kDefaultUArchIndex, kMaxUArchIndex,
			kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
			kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
			kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
			0 /* flags */);
	}

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					const std::atomic_int& counter = hits[flat];
					EXPECT_EQ(counter.load(std::memory_order_relaxed), kIncrementIterations)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counter.load(std::memory_order_relaxed) << " times "
						<< "(expected: " << kIncrementIterations << ")";
					flat++;
				}
			}
		}
	}
}
5108 
/*
 * On a pool created with pthreadpool_create(0), kIncrementIterations back-to-back parallelize
 * calls with Increment4DTile2DWithUArch must leave every per-element counter at
 * kIncrementIterations. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count = kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> hits(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	for (size_t pass = 0; pass < kIncrementIterations; pass++) {
		pthreadpool_parallelize_4d_tile_2d_with_uarch(
			pool.get(),
			reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(Increment4DTile2DWithUArch),
			static_cast<void*>(hits.data()),
			kDefaultUArchIndex, kMaxUArchIndex,
			kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
			kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
			kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
			0 /* flags */);
	}

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) {
					const std::atomic_int& counter = hits[flat];
					EXPECT_EQ(counter.load(std::memory_order_relaxed), kIncrementIterations)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counter.load(std::memory_order_relaxed) << " times "
						<< "(expected: " << kIncrementIterations << ")";
					flat++;
				}
			}
		}
	}
}
5144 
/*
 * Task callback: adds one to a single shared counter for every element of the tile
 * (tile_k * tile_l increments in total).
 *
 * num_processed_items must point to a std::atomic_int. The increments are deliberately issued
 * one at a time rather than as a single fetch_add of the tile area: the callback is used by the
 * HighContention test, where every task hammers the same counter.
 *
 * The first parameter is void* (the same first-parameter type as the callbacks that are passed to
 * pthreadpool_parallelize_4d_tile_2d_with_uarch without a cast) so that the function is invoked
 * through a pointer of its own type; calling through a pointer to a different function type is
 * undefined behavior.
 */
static void IncrementSame4DTile2DWithUArch(void* num_processed_items, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
	std::atomic_int* const counter = static_cast<std::atomic_int*>(num_processed_items);
	for (size_t k = start_k; k < start_k + tile_k; k++) {
		for (size_t l = start_l; l < start_l + tile_l; l++) {
			counter->fetch_add(1, std::memory_order_relaxed);
		}
	}
}
5152 
/*
 * All tasks increment one shared counter (IncrementSame4DTile2DWithUArch); afterwards the counter
 * must equal the number of elements in the 4D range. Runs on a pool created with
 * pthreadpool_create(0) and is skipped when that pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolHighContention) {
	/* Direct initialization instead of ATOMIC_VAR_INIT, which is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(IncrementSame4DTile2DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Compare as size_t on both sides to avoid a mixed signed/unsigned comparison. */
	const size_t expected_items = kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
5173 
WorkImbalance4DTile2DWithUArch(std::atomic_int * num_processed_items,uint32_t,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)5174 static void WorkImbalance4DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
5175 	num_processed_items->fetch_add(tile_k * tile_l, std::memory_order_relaxed);
5176 	if (i == 0 && j == 0 && start_k == 0 && start_l == 0) {
5177 		/* Spin-wait until all items are computed */
5178 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL) {
5179 			std::atomic_thread_fence(std::memory_order_acquire);
5180 		}
5181 	}
5182 }
5183 
/*
 * Dispatches WorkImbalance4DTile2DWithUArch, whose first-tile task spins until every element has
 * been counted, so the call can only return if the remaining tiles get processed while that task
 * is blocked. Afterwards the counter must equal the number of elements in the 4D range.
 * Runs on a pool created with pthreadpool_create(0) and is skipped when that pool reports at
 * most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolWorkStealing) {
	/* Direct initialization instead of ATOMIC_VAR_INIT, which is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(WorkImbalance4DTile2DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Compare as size_t on both sides to avoid a mixed signed/unsigned comparison. */
	const size_t expected_items = kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
5204 
/* Task callback that ignores all of its arguments; used by the "Completes" smoke tests. */
static void ComputeNothing5D(void*, size_t, size_t, size_t, size_t, size_t) {
	/* Intentionally empty. */
}
5207 
/* Smoke test: a no-op 5D task dispatched on a pool created with one thread must return. */
TEST(Parallelize5D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_5d(
		pool.get(), ComputeNothing5D, nullptr,
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK,
		kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);
}
5218 
/*
 * Smoke test: a no-op 5D task dispatched on a pool created with pthreadpool_create(0) must return.
 * Skipped when that pool reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d(
		pool.get(), ComputeNothing5D, nullptr,
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK,
		kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);
}
5234 
CheckBounds5D(void *,size_t i,size_t j,size_t k,size_t l,size_t m)5235 static void CheckBounds5D(void*, size_t i, size_t j, size_t k, size_t l, size_t m) {
5236 	EXPECT_LT(i, kParallelize5DRangeI);
5237 	EXPECT_LT(j, kParallelize5DRangeJ);
5238 	EXPECT_LT(k, kParallelize5DRangeK);
5239 	EXPECT_LT(l, kParallelize5DRangeL);
5240 	EXPECT_LT(m, kParallelize5DRangeM);
5241 }
5242 
/* Runs CheckBounds5D over the whole 5D range on a pool created with one thread. */
TEST(Parallelize5D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_5d(
		pool.get(), CheckBounds5D, nullptr,
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK,
		kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);
}
5254 
/*
 * Runs CheckBounds5D over the whole 5D range on a pool created with pthreadpool_create(0).
 * Skipped when that pool reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d(
		pool.get(), CheckBounds5D, nullptr,
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK,
		kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);
}
5270 
SetTrue5D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l,size_t m)5271 static void SetTrue5D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t m) {
5272 	const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m;
5273 	processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
5274 }
5275 
/*
 * On a pool created with one thread, every element of the 5D range must have its indicator
 * set by SetTrue5D after a single parallelize call.
 */
TEST(Parallelize5D, SingleThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_bool> flags(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_5d(
		pool.get(),
		reinterpret_cast<pthreadpool_task_5d_t>(SetTrue5D),
		static_cast<void*>(flags.data()),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK,
		kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						EXPECT_TRUE(flags[flat].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
						flat++;
					}
				}
			}
		}
	}
}
5303 
/*
 * On a pool created with pthreadpool_create(0), every element of the 5D range must have its
 * indicator set by SetTrue5D. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_bool> flags(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d(
		pool.get(),
		reinterpret_cast<pthreadpool_task_5d_t>(SetTrue5D),
		static_cast<void*>(flags.data()),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK,
		kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						EXPECT_TRUE(flags[flat].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
						flat++;
					}
				}
			}
		}
	}
}
5335 
Increment5D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l,size_t m)5336 static void Increment5D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t m) {
5337 	const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m;
5338 	processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
5339 }
5340 
/*
 * On a pool created with one thread, a single parallelize call with Increment5D must leave
 * every per-element counter at exactly 1.
 */
TEST(Parallelize5D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t element_count = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_int> hits(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_5d(
		pool.get(),
		reinterpret_cast<pthreadpool_task_5d_t>(Increment5D),
		static_cast<void*>(hits.data()),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK,
		kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						const std::atomic_int& counter = hits[flat];
						EXPECT_EQ(counter.load(std::memory_order_relaxed), 1)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counter.load(std::memory_order_relaxed) << " times (expected: 1)";
						flat++;
					}
				}
			}
		}
	}
}
5369 
/*
 * On a pool created with pthreadpool_create(0), a single parallelize call with Increment5D must
 * leave every per-element counter at exactly 1. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t element_count = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_int> hits(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d(
		pool.get(),
		reinterpret_cast<pthreadpool_task_5d_t>(Increment5D),
		static_cast<void*>(hits.data()),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK,
		kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						const std::atomic_int& counter = hits[flat];
						EXPECT_EQ(counter.load(std::memory_order_relaxed), 1)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counter.load(std::memory_order_relaxed) << " times (expected: 1)";
						flat++;
					}
				}
			}
		}
	}
}
5402 
/*
 * On a pool created with one thread, kIncrementIterations5D back-to-back parallelize calls with
 * Increment5D must leave every per-element counter at kIncrementIterations5D.
 */
TEST(Parallelize5D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_int> hits(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	for (size_t pass = 0; pass < kIncrementIterations5D; pass++) {
		pthreadpool_parallelize_5d(
			pool.get(),
			reinterpret_cast<pthreadpool_task_5d_t>(Increment5D),
			static_cast<void*>(hits.data()),
			kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK,
			kParallelize5DRangeL, kParallelize5DRangeM,
			0 /* flags */);
	}

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						const std::atomic_int& counter = hits[flat];
						EXPECT_EQ(counter.load(std::memory_order_relaxed), kIncrementIterations5D)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counter.load(std::memory_order_relaxed) << " times "
							<< "(expected: " << kIncrementIterations5D << ")";
						flat++;
					}
				}
			}
		}
	}
}
5434 
/*
 * On a pool created with pthreadpool_create(0), kIncrementIterations5D back-to-back parallelize
 * calls with Increment5D must leave every per-element counter at kIncrementIterations5D.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_int> hits(element_count);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	for (size_t pass = 0; pass < kIncrementIterations5D; pass++) {
		pthreadpool_parallelize_5d(
			pool.get(),
			reinterpret_cast<pthreadpool_task_5d_t>(Increment5D),
			static_cast<void*>(hits.data()),
			kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK,
			kParallelize5DRangeL, kParallelize5DRangeM,
			0 /* flags */);
	}

	/* The loops visit elements in row-major order, so the linear index simply advances by one. */
	size_t flat = 0;
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						const std::atomic_int& counter = hits[flat];
						EXPECT_EQ(counter.load(std::memory_order_relaxed), kIncrementIterations5D)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counter.load(std::memory_order_relaxed) << " times "
							<< "(expected: " << kIncrementIterations5D << ")";
						flat++;
					}
				}
			}
		}
	}
}
5470 
/*
 * Task callback: adds one to the single shared counter on every invocation.
 * The five index arguments are accepted but never read.
 */
static void IncrementSame5D(
	std::atomic_int* num_processed_items,
	size_t /* i */, size_t /* j */, size_t /* k */, size_t /* l */, size_t /* m */)
{
	std::atomic_fetch_add_explicit(num_processed_items, 1, std::memory_order_relaxed);
}
5474 
/*
 * Every task invocation (IncrementSame5D) increments the same atomic counter.
 * After pthreadpool_parallelize_5d returns, the counter is expected to equal
 * the number of items in the 5D range. Skipped when the pool created with
 * pthreadpool_create(0) reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolHighContention) {
	/* Direct-initialized: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_t>(IncrementSame5D),
		static_cast<void*>(&num_processed_items),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM);
}
5493 
WorkImbalance5D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l,size_t m)5494 static void WorkImbalance5D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m) {
5495 	num_processed_items->fetch_add(1, std::memory_order_relaxed);
5496 	if (i == 0 && j == 0 && k == 0 && l == 0 && m == 0) {
5497 		/* Spin-wait until all items are computed */
5498 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM) {
5499 			std::atomic_thread_fence(std::memory_order_acquire);
5500 		}
5501 	}
5502 }
5503 
/*
 * Runs WorkImbalance5D, whose first item spin-waits until every item has been
 * counted, and expects the shared counter to equal the number of items in the
 * 5D range once pthreadpool_parallelize_5d returns. Skipped when the pool
 * created with pthreadpool_create(0) reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolWorkStealing) {
	/* Direct-initialized: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_t>(WorkImbalance5D),
		static_cast<void*>(&num_processed_items),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM);
}
5522 
/*
 * No-op task for pthreadpool_parallelize_5d_tile_1d: takes a context pointer
 * and six size_t arguments and ignores all of them.
 */
static void ComputeNothing5DTile1D(
	void* /* context */,
	size_t, size_t, size_t, size_t,
	size_t, size_t)
{
}
5525 
/*
 * Smoke test: dispatches the no-op task through
 * pthreadpool_parallelize_5d_tile_1d on a single-thread pool.
 * There are no assertions beyond successful pool creation.
 */
TEST(Parallelize5DTile1D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	pthreadpool_parallelize_5d_tile_1d(
		pool,
		ComputeNothing5DTile1D,
		nullptr,
		kParallelize5DTile1DRangeI,
		kParallelize5DTile1DRangeJ,
		kParallelize5DTile1DRangeK,
		kParallelize5DTile1DRangeL,
		kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5537 
/*
 * Smoke test: dispatches the no-op task through
 * pthreadpool_parallelize_5d_tile_1d on a pool created with pthreadpool_create(0).
 * Skipped when that pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	const auto thread_count = pthreadpool_get_threads_count(pool);
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_1d(
		pool,
		ComputeNothing5DTile1D,
		nullptr,
		kParallelize5DTile1DRangeI,
		kParallelize5DTile1DRangeJ,
		kParallelize5DTile1DRangeK,
		kParallelize5DTile1DRangeL,
		kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5554 
CheckBounds5DTile1D(void *,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t tile_m)5555 static void CheckBounds5DTile1D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) {
5556 	EXPECT_LT(i, kParallelize5DTile1DRangeI);
5557 	EXPECT_LT(j, kParallelize5DTile1DRangeJ);
5558 	EXPECT_LT(k, kParallelize5DTile1DRangeK);
5559 	EXPECT_LT(l, kParallelize5DTile1DRangeL);
5560 	EXPECT_LT(start_m, kParallelize5DTile1DRangeM);
5561 	EXPECT_LE(start_m + tile_m, kParallelize5DTile1DRangeM);
5562 }
5563 
/*
 * Runs CheckBounds5DTile1D for every task invocation on a single-thread pool;
 * the expectations live inside the callback.
 */
TEST(Parallelize5DTile1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	pthreadpool_parallelize_5d_tile_1d(
		pool,
		CheckBounds5DTile1D,
		nullptr,
		kParallelize5DTile1DRangeI,
		kParallelize5DTile1DRangeJ,
		kParallelize5DTile1DRangeK,
		kParallelize5DTile1DRangeL,
		kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5576 
/*
 * Runs CheckBounds5DTile1D for every task invocation on a pool created with
 * pthreadpool_create(0). Skipped when that pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	const auto thread_count = pthreadpool_get_threads_count(pool);
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_1d(
		pool,
		CheckBounds5DTile1D,
		nullptr,
		kParallelize5DTile1DRangeI,
		kParallelize5DTile1DRangeJ,
		kParallelize5DTile1DRangeK,
		kParallelize5DTile1DRangeL,
		kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5593 
CheckTiling5DTile1D(void *,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t tile_m)5594 static void CheckTiling5DTile1D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) {
5595 	EXPECT_GT(tile_m, 0);
5596 	EXPECT_LE(tile_m, kParallelize5DTile1DTileM);
5597 	EXPECT_EQ(start_m % kParallelize5DTile1DTileM, 0);
5598 	EXPECT_EQ(tile_m, std::min<size_t>(kParallelize5DTile1DTileM, kParallelize5DTile1DRangeM - start_m));
5599 }
5600 
/*
 * Runs CheckTiling5DTile1D for every task invocation on a single-thread pool;
 * the expectations live inside the callback.
 */
TEST(Parallelize5DTile1D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	pthreadpool_parallelize_5d_tile_1d(
		pool,
		CheckTiling5DTile1D,
		nullptr,
		kParallelize5DTile1DRangeI,
		kParallelize5DTile1DRangeJ,
		kParallelize5DTile1DRangeK,
		kParallelize5DTile1DRangeL,
		kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5613 
/*
 * Runs CheckTiling5DTile1D for every task invocation on a pool created with
 * pthreadpool_create(0). Skipped when that pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	const auto thread_count = pthreadpool_get_threads_count(pool);
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_1d(
		pool,
		CheckTiling5DTile1D,
		nullptr,
		kParallelize5DTile1DRangeI,
		kParallelize5DTile1DRangeJ,
		kParallelize5DTile1DRangeK,
		kParallelize5DTile1DRangeL,
		kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5630 
SetTrue5DTile1D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t tile_m)5631 static void SetTrue5DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) {
5632 	for (size_t m = start_m; m < start_m + tile_m; m++) {
5633 		const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m;
5634 		processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
5635 	}
5636 }
5637 
/*
 * Single-thread pool: runs SetTrue5DTile1D over the whole range and expects
 * the indicator of every element to be set afterwards.
 */
TEST(Parallelize5DTile1D, SingleThreadPoolAllItemsProcessed) {
	const size_t item_count =
		kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK *
		kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_bool> indicators(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(SetTrue5DTile1D),
		static_cast<void*>(indicators.data()),
		kParallelize5DTile1DRangeI,
		kParallelize5DTile1DRangeJ,
		kParallelize5DTile1DRangeK,
		kParallelize5DTile1DRangeL,
		kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);

	/* Row-major walk; the flat offset is accumulated one dimension at a time */
	for (size_t i = 0; i < kParallelize5DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize5DTile1DRangeJ; ++j) {
			const size_t offset_ij = i * kParallelize5DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile1DRangeK; ++k) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile1DRangeL; ++l) {
					const size_t row_offset = (offset_ijk * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM;
					for (size_t m = 0; m < kParallelize5DTile1DRangeM; ++m) {
						const size_t linear_idx = row_offset + m;
						EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
					}
				}
			}
		}
	}
}
5666 
/*
 * Pool created with pthreadpool_create(0): runs SetTrue5DTile1D over the whole
 * range and expects the indicator of every element to be set afterwards.
 * Skipped when that pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolAllItemsProcessed) {
	const size_t item_count =
		kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK *
		kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_bool> indicators(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto thread_count = pthreadpool_get_threads_count(threadpool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(SetTrue5DTile1D),
		static_cast<void*>(indicators.data()),
		kParallelize5DTile1DRangeI,
		kParallelize5DTile1DRangeJ,
		kParallelize5DTile1DRangeK,
		kParallelize5DTile1DRangeL,
		kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);

	/* Row-major walk; the flat offset is accumulated one dimension at a time */
	for (size_t i = 0; i < kParallelize5DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize5DTile1DRangeJ; ++j) {
			const size_t offset_ij = i * kParallelize5DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile1DRangeK; ++k) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile1DRangeL; ++l) {
					const size_t row_offset = (offset_ijk * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM;
					for (size_t m = 0; m < kParallelize5DTile1DRangeM; ++m) {
						const size_t linear_idx = row_offset + m;
						EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
					}
				}
			}
		}
	}
}
5699 
Increment5DTile1D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t tile_m)5700 static void Increment5DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) {
5701 	for (size_t m = start_m; m < start_m + tile_m; m++) {
5702 		const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m;
5703 		processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
5704 	}
5705 }
5706 
/*
 * Single-thread pool: runs Increment5DTile1D once over the whole range and
 * expects every element's counter to be exactly 1.
 */
TEST(Parallelize5DTile1D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t item_count =
		kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK *
		kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_int> counters(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(Increment5DTile1D),
		static_cast<void*>(counters.data()),
		kParallelize5DTile1DRangeI,
		kParallelize5DTile1DRangeJ,
		kParallelize5DTile1DRangeK,
		kParallelize5DTile1DRangeL,
		kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);

	/* Visit counters in storage order; peel (m, l, k, j, i) off the flat index, fastest dimension first */
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		size_t rest = linear_idx;
		const size_t m = rest % kParallelize5DTile1DRangeM;
		rest /= kParallelize5DTile1DRangeM;
		const size_t l = rest % kParallelize5DTile1DRangeL;
		rest /= kParallelize5DTile1DRangeL;
		const size_t k = rest % kParallelize5DTile1DRangeK;
		rest /= kParallelize5DTile1DRangeK;
		const size_t j = rest % kParallelize5DTile1DRangeJ;
		const size_t i = rest / kParallelize5DTile1DRangeJ;

		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
5736 
/*
 * Pool created with pthreadpool_create(0): runs Increment5DTile1D once over
 * the whole range and expects every element's counter to be exactly 1.
 * Skipped when that pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t item_count =
		kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK *
		kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_int> counters(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto thread_count = pthreadpool_get_threads_count(threadpool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(Increment5DTile1D),
		static_cast<void*>(counters.data()),
		kParallelize5DTile1DRangeI,
		kParallelize5DTile1DRangeJ,
		kParallelize5DTile1DRangeK,
		kParallelize5DTile1DRangeL,
		kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);

	/* Visit counters in storage order; peel (m, l, k, j, i) off the flat index, fastest dimension first */
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		size_t rest = linear_idx;
		const size_t m = rest % kParallelize5DTile1DRangeM;
		rest /= kParallelize5DTile1DRangeM;
		const size_t l = rest % kParallelize5DTile1DRangeL;
		rest /= kParallelize5DTile1DRangeL;
		const size_t k = rest % kParallelize5DTile1DRangeK;
		rest /= kParallelize5DTile1DRangeK;
		const size_t j = rest % kParallelize5DTile1DRangeJ;
		const size_t i = rest / kParallelize5DTile1DRangeJ;

		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
5770 
/*
 * Single-thread pool: runs Increment5DTile1D kIncrementIterations5D times over
 * the same counters, then expects every counter to hold exactly
 * kIncrementIterations5D.
 */
TEST(Parallelize5DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t item_count =
		kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK *
		kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_int> counters(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	for (size_t pass = 0; pass < kIncrementIterations5D; ++pass) {
		pthreadpool_parallelize_5d_tile_1d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(Increment5DTile1D),
			static_cast<void*>(counters.data()),
			kParallelize5DTile1DRangeI,
			kParallelize5DTile1DRangeJ,
			kParallelize5DTile1DRangeK,
			kParallelize5DTile1DRangeL,
			kParallelize5DTile1DRangeM,
			kParallelize5DTile1DTileM,
			0 /* flags */);
	}

	/* Visit counters in storage order; peel (m, l, k, j, i) off the flat index, fastest dimension first */
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		size_t rest = linear_idx;
		const size_t m = rest % kParallelize5DTile1DRangeM;
		rest /= kParallelize5DTile1DRangeM;
		const size_t l = rest % kParallelize5DTile1DRangeL;
		rest /= kParallelize5DTile1DRangeL;
		const size_t k = rest % kParallelize5DTile1DRangeK;
		rest /= kParallelize5DTile1DRangeK;
		const size_t j = rest % kParallelize5DTile1DRangeJ;
		const size_t i = rest / kParallelize5DTile1DRangeJ;

		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations5D << ")";
	}
}
5803 
/*
 * Pool created with pthreadpool_create(0): runs Increment5DTile1D
 * kIncrementIterations5D times over the same counters, then expects every
 * counter to hold exactly kIncrementIterations5D.
 * Skipped when that pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t item_count =
		kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK *
		kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_int> counters(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto thread_count = pthreadpool_get_threads_count(threadpool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	for (size_t pass = 0; pass < kIncrementIterations5D; ++pass) {
		pthreadpool_parallelize_5d_tile_1d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(Increment5DTile1D),
			static_cast<void*>(counters.data()),
			kParallelize5DTile1DRangeI,
			kParallelize5DTile1DRangeJ,
			kParallelize5DTile1DRangeK,
			kParallelize5DTile1DRangeL,
			kParallelize5DTile1DRangeM,
			kParallelize5DTile1DTileM,
			0 /* flags */);
	}

	/* Visit counters in storage order; peel (m, l, k, j, i) off the flat index, fastest dimension first */
	for (size_t linear_idx = 0; linear_idx < item_count; ++linear_idx) {
		size_t rest = linear_idx;
		const size_t m = rest % kParallelize5DTile1DRangeM;
		rest /= kParallelize5DTile1DRangeM;
		const size_t l = rest % kParallelize5DTile1DRangeL;
		rest /= kParallelize5DTile1DRangeL;
		const size_t k = rest % kParallelize5DTile1DRangeK;
		rest /= kParallelize5DTile1DRangeK;
		const size_t j = rest % kParallelize5DTile1DRangeJ;
		const size_t i = rest / kParallelize5DTile1DRangeJ;

		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations5D << ")";
	}
}
5840 
/*
 * Task callback: performs one separate increment of the shared counter for
 * each element of the tile [start_m, start_m + tile_m).
 * The i/j/k/l indices are accepted but never read.
 */
static void IncrementSame5DTile1D(
	std::atomic_int* num_processed_items,
	size_t /* i */, size_t /* j */, size_t /* k */, size_t /* l */,
	size_t start_m, size_t tile_m)
{
	const size_t end_m = start_m + tile_m;
	for (size_t m = start_m; m < end_m; ++m) {
		std::atomic_fetch_add_explicit(num_processed_items, 1, std::memory_order_relaxed);
	}
}
5846 
/*
 * Every task invocation (IncrementSame5DTile1D) increments the same atomic
 * counter once per element of its tile. After
 * pthreadpool_parallelize_5d_tile_1d returns, the counter is expected to equal
 * the number of elements in the 5D range. Skipped when the pool created with
 * pthreadpool_create(0) reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolHighContention) {
	/* Direct-initialized: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(IncrementSame5DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM);
}
5866 
WorkImbalance5DTile1D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t tile_m)5867 static void WorkImbalance5DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) {
5868 	num_processed_items->fetch_add(tile_m, std::memory_order_relaxed);
5869 	if (i == 0 && j == 0 && k == 0 && l == 0 && start_m == 0) {
5870 		/* Spin-wait until all items are computed */
5871 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM) {
5872 			std::atomic_thread_fence(std::memory_order_acquire);
5873 		}
5874 	}
5875 }
5876 
/*
 * Runs WorkImbalance5DTile1D, whose first tile spin-waits until every element
 * has been counted, and expects the shared counter to equal the number of
 * elements in the 5D range once pthreadpool_parallelize_5d_tile_1d returns.
 * Skipped when the pool created with pthreadpool_create(0) reports at most
 * one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolWorkStealing) {
	/* Direct-initialized: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(WorkImbalance5DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM);
}
5896 
/*
 * No-op task for pthreadpool_parallelize_5d_tile_2d: takes a context pointer
 * and seven size_t arguments and ignores all of them.
 */
static void ComputeNothing5DTile2D(
	void* /* context */,
	size_t, size_t, size_t,
	size_t, size_t,
	size_t, size_t)
{
}
5899 
/*
 * Smoke test: dispatches the no-op task through
 * pthreadpool_parallelize_5d_tile_2d on a single-thread pool.
 * There are no assertions beyond successful pool creation.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	pthreadpool_parallelize_5d_tile_2d(
		pool,
		ComputeNothing5DTile2D,
		nullptr,
		kParallelize5DTile2DRangeI,
		kParallelize5DTile2DRangeJ,
		kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL,
		kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL,
		kParallelize5DTile2DTileM,
		0 /* flags */);
}
5911 
/*
 * Smoke test: dispatches the no-op task through
 * pthreadpool_parallelize_5d_tile_2d on a pool created with pthreadpool_create(0).
 * Skipped when that pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	const auto thread_count = pthreadpool_get_threads_count(pool);
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_2d(
		pool,
		ComputeNothing5DTile2D,
		nullptr,
		kParallelize5DTile2DRangeI,
		kParallelize5DTile2DRangeJ,
		kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL,
		kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL,
		kParallelize5DTile2DTileM,
		0 /* flags */);
}
5928 
CheckBounds5DTile2D(void *,size_t i,size_t j,size_t k,size_t start_l,size_t start_m,size_t tile_l,size_t tile_m)5929 static void CheckBounds5DTile2D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
5930 	EXPECT_LT(i, kParallelize5DTile2DRangeI);
5931 	EXPECT_LT(j, kParallelize5DTile2DRangeJ);
5932 	EXPECT_LT(k, kParallelize5DTile2DRangeK);
5933 	EXPECT_LT(start_l, kParallelize5DTile2DRangeL);
5934 	EXPECT_LT(start_m, kParallelize5DTile2DRangeM);
5935 	EXPECT_LE(start_l + tile_l, kParallelize5DTile2DRangeL);
5936 	EXPECT_LE(start_m + tile_m, kParallelize5DTile2DRangeM);
5937 }
5938 
/*
 * Runs CheckBounds5DTile2D for every task invocation on a single-thread pool;
 * the expectations live inside the callback.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	pthreadpool_parallelize_5d_tile_2d(
		pool,
		CheckBounds5DTile2D,
		nullptr,
		kParallelize5DTile2DRangeI,
		kParallelize5DTile2DRangeJ,
		kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL,
		kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL,
		kParallelize5DTile2DTileM,
		0 /* flags */);
}
5951 
/*
 * Runs CheckBounds5DTile2D for every task invocation on a pool created with
 * pthreadpool_create(0). Skipped when that pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	const auto thread_count = pthreadpool_get_threads_count(pool);
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_2d(
		pool,
		CheckBounds5DTile2D,
		nullptr,
		kParallelize5DTile2DRangeI,
		kParallelize5DTile2DRangeJ,
		kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL,
		kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL,
		kParallelize5DTile2DTileM,
		0 /* flags */);
}
5968 
CheckTiling5DTile2D(void *,size_t i,size_t j,size_t k,size_t start_l,size_t start_m,size_t tile_l,size_t tile_m)5969 static void CheckTiling5DTile2D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
5970 	EXPECT_GT(tile_l, 0);
5971 	EXPECT_LE(tile_l, kParallelize5DTile2DTileL);
5972 	EXPECT_EQ(start_l % kParallelize5DTile2DTileL, 0);
5973 	EXPECT_EQ(tile_l, std::min<size_t>(kParallelize5DTile2DTileL, kParallelize5DTile2DRangeL - start_l));
5974 
5975 	EXPECT_GT(tile_m, 0);
5976 	EXPECT_LE(tile_m, kParallelize5DTile2DTileM);
5977 	EXPECT_EQ(start_m % kParallelize5DTile2DTileM, 0);
5978 	EXPECT_EQ(tile_m, std::min<size_t>(kParallelize5DTile2DTileM, kParallelize5DTile2DRangeM - start_m));
5979 }
5980 
/*
 * Runs CheckTiling5DTile2D for every task invocation on a single-thread pool;
 * the expectations live inside the callback.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	pthreadpool_parallelize_5d_tile_2d(
		pool,
		CheckTiling5DTile2D,
		nullptr,
		kParallelize5DTile2DRangeI,
		kParallelize5DTile2DRangeJ,
		kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL,
		kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL,
		kParallelize5DTile2DTileM,
		0 /* flags */);
}
5993 
/*
 * Runs CheckTiling5DTile2D for every task invocation on a pool created with
 * pthreadpool_create(0). Skipped when that pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	const auto thread_count = pthreadpool_get_threads_count(pool);
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_2d(
		pool,
		CheckTiling5DTile2D,
		nullptr,
		kParallelize5DTile2DRangeI,
		kParallelize5DTile2DRangeJ,
		kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL,
		kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL,
		kParallelize5DTile2DTileM,
		0 /* flags */);
}
6010 
SetTrue5DTile2D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t start_l,size_t start_m,size_t tile_l,size_t tile_m)6011 static void SetTrue5DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
6012 	for (size_t l = start_l; l < start_l + tile_l; l++) {
6013 		for (size_t m = start_m; m < start_m + tile_m; m++) {
6014 			const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m;
6015 			processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
6016 		}
6017 	}
6018 }
6019 
/*
 * Single-thread pool: runs SetTrue5DTile2D over the whole range and expects
 * the indicator of every element to be set afterwards.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolAllItemsProcessed) {
	const size_t item_count =
		kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK *
		kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
	std::vector<std::atomic_bool> indicators(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(SetTrue5DTile2D),
		static_cast<void*>(indicators.data()),
		kParallelize5DTile2DRangeI,
		kParallelize5DTile2DRangeJ,
		kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL,
		kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL,
		kParallelize5DTile2DTileM,
		0 /* flags */);

	/* Row-major walk; the flat offset is accumulated one dimension at a time */
	for (size_t i = 0; i < kParallelize5DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize5DTile2DRangeJ; ++j) {
			const size_t offset_ij = i * kParallelize5DTile2DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile2DRangeK; ++k) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile2DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile2DRangeL; ++l) {
					const size_t row_offset = (offset_ijk * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM;
					for (size_t m = 0; m < kParallelize5DTile2DRangeM; ++m) {
						const size_t linear_idx = row_offset + m;
						EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
					}
				}
			}
		}
	}
}
6048 
/*
 * Pool from pthreadpool_create(0): after one pthreadpool_parallelize_5d_tile_2d
 * pass with SetTrue5DTile2D, every element's indicator must be set.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolAllItemsProcessed) {
	const size_t element_count =
		kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK *
		kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with a single thread cannot exercise the multi-threaded path. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(SetTrue5DTile2D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, context,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);

	/* Visit the elements in row-major order, extending the flat index one dimension at a time. */
	for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) {
			const size_t ij = i * kParallelize5DTile2DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DTile2DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DTile2DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DTile2DRangeM + m;
						EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
					}
				}
			}
		}
	}
}
6081 
Increment5DTile2D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t start_l,size_t start_m,size_t tile_l,size_t tile_m)6082 static void Increment5DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
6083 	for (size_t l = start_l; l < start_l + tile_l; l++) {
6084 		for (size_t m = start_m; m < start_m + tile_m; m++) {
6085 			const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m;
6086 			processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
6087 		}
6088 	}
6089 }
6090 
/*
 * Pool from pthreadpool_create(1): after one pthreadpool_parallelize_5d_tile_2d
 * pass with Increment5DTile2D, every element's counter must equal 1.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t element_count =
		kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK *
		kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The helper takes a typed context pointer, so it is cast to the task signature. */
	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, context,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);

	/* Visit the elements in row-major order, extending the flat index one dimension at a time. */
	for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) {
			const size_t ij = i * kParallelize5DTile2DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DTile2DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DTile2DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DTile2DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					}
				}
			}
		}
	}
}
6120 
/*
 * Pool from pthreadpool_create(0): after one pthreadpool_parallelize_5d_tile_2d
 * pass with Increment5DTile2D, every element's counter must equal 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t element_count =
		kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK *
		kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with a single thread cannot exercise the multi-threaded path. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, context,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);

	/* Visit the elements in row-major order, extending the flat index one dimension at a time. */
	for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) {
			const size_t ij = i * kParallelize5DTile2DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DTile2DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DTile2DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DTile2DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					}
				}
			}
		}
	}
}
6154 
/*
 * Pool from pthreadpool_create(1): after kIncrementIterations5D consecutive
 * pthreadpool_parallelize_5d_tile_2d passes with Increment5DTile2D, every
 * element's counter must equal kIncrementIterations5D.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count =
		kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK *
		kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Task and context are the same for every pass, so they are prepared once. */
	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations5D; pass++) {
		pthreadpool_parallelize_5d_tile_2d(
			threadpool.get(), task, context,
			kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK,
			kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
			kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
			0 /* flags */);
	}

	/* Visit the elements in row-major order, extending the flat index one dimension at a time. */
	for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) {
			const size_t ij = i * kParallelize5DTile2DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DTile2DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DTile2DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DTile2DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
							<< "(expected: " << kIncrementIterations5D << ")";
					}
				}
			}
		}
	}
}
6187 
/*
 * Pool from pthreadpool_create(0): after kIncrementIterations5D consecutive
 * pthreadpool_parallelize_5d_tile_2d passes with Increment5DTile2D, every
 * element's counter must equal kIncrementIterations5D.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count =
		kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK *
		kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with a single thread cannot exercise the multi-threaded path. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	/* Task and context are the same for every pass, so they are prepared once. */
	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations5D; pass++) {
		pthreadpool_parallelize_5d_tile_2d(
			threadpool.get(), task, context,
			kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK,
			kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
			kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
			0 /* flags */);
	}

	/* Visit the elements in row-major order, extending the flat index one dimension at a time. */
	for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) {
			const size_t ij = i * kParallelize5DTile2DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DTile2DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DTile2DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DTile2DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
							<< "(expected: " << kIncrementIterations5D << ")";
					}
				}
			}
		}
	}
}
6224 
/*
 * Task callback that bumps one shared counter once per element of the tile
 * [start_l, start_l + tile_l) x [start_m, start_m + tile_m).
 *
 * The outer coordinates i, j and k are accepted but not used. Each element
 * gets its own relaxed atomic increment rather than a single bulk add.
 */
static void IncrementSame5DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
	const size_t end_l = start_l + tile_l;
	const size_t end_m = start_m + tile_m;
	for (size_t l = start_l; l < end_l; l++) {
		for (size_t m = start_m; m < end_m; m++) {
			std::atomic_fetch_add_explicit(num_processed_items, 1, std::memory_order_relaxed);
		}
	}
}
6232 
/*
 * Pool from pthreadpool_create(0): every task invocation increments the same
 * atomic counter (IncrementSame5DTile2D); afterwards the counter must equal
 * the total number of elements in the 5D range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolHighContention) {
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with a single thread cannot produce contention. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(IncrementSame5DTile2D);
	void* const context = static_cast<void*>(&num_processed_items);
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, context,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);

	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM);
}
6252 
WorkImbalance5DTile2D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t start_l,size_t start_m,size_t tile_l,size_t tile_m)6253 static void WorkImbalance5DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
6254 	num_processed_items->fetch_add(tile_l * tile_m, std::memory_order_relaxed);
6255 	if (i == 0 && j == 0 && k == 0 && start_l == 0 && start_m == 0) {
6256 		/* Spin-wait until all items are computed */
6257 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM) {
6258 			std::atomic_thread_fence(std::memory_order_acquire);
6259 		}
6260 	}
6261 }
6262 
/*
 * Pool from pthreadpool_create(0): runs WorkImbalance5DTile2D, whose first
 * tile blocks until all elements are counted, and then checks that the
 * counter equals the total number of elements in the 5D range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolWorkStealing) {
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* With a single thread nobody else could finish the work the first tile waits for. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(WorkImbalance5DTile2D);
	void* const context = static_cast<void*>(&num_processed_items);
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, context,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK,
		kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);

	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM);
}
6282 
/* No-op 6D task: ignores its context and all six indices. */
static void ComputeNothing6D(void*, size_t, size_t, size_t, size_t, size_t, size_t) {
	/* Intentionally empty. */
}
6285 
/*
 * Pool from pthreadpool_create(1): pthreadpool_parallelize_6d with a no-op
 * task must simply return; there is nothing else to verify.
 */
TEST(Parallelize6D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_6d(
		threadpool.get(),
		ComputeNothing6D,
		nullptr /* context */,
		kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
		kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
		0 /* flags */);
}
6296 
/*
 * Pool from pthreadpool_create(0): pthreadpool_parallelize_6d with a no-op
 * task must simply return. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with a single thread cannot exercise the multi-threaded path. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d(
		threadpool.get(),
		ComputeNothing6D,
		nullptr /* context */,
		kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
		kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
		0 /* flags */);
}
6312 
CheckBounds6D(void *,size_t i,size_t j,size_t k,size_t l,size_t m,size_t n)6313 static void CheckBounds6D(void*, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
6314 	EXPECT_LT(i, kParallelize6DRangeI);
6315 	EXPECT_LT(j, kParallelize6DRangeJ);
6316 	EXPECT_LT(k, kParallelize6DRangeK);
6317 	EXPECT_LT(l, kParallelize6DRangeL);
6318 	EXPECT_LT(m, kParallelize6DRangeM);
6319 	EXPECT_LT(n, kParallelize6DRangeN);
6320 }
6321 
/*
 * Pool from pthreadpool_create(1): every index tuple handed to the task by
 * pthreadpool_parallelize_6d must pass the range checks in CheckBounds6D.
 */
TEST(Parallelize6D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_6d(
		threadpool.get(),
		CheckBounds6D,
		nullptr /* context */,
		kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
		kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
		0 /* flags */);
}
6333 
/*
 * Pool from pthreadpool_create(0): every index tuple handed to the task by
 * pthreadpool_parallelize_6d must pass the range checks in CheckBounds6D.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with a single thread cannot exercise the multi-threaded path. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d(
		threadpool.get(),
		CheckBounds6D,
		nullptr /* context */,
		kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
		kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
		0 /* flags */);
}
6349 
SetTrue6D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l,size_t m,size_t n)6350 static void SetTrue6D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
6351 	const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n;
6352 	processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
6353 }
6354 
/*
 * Pool from pthreadpool_create(1): after one pthreadpool_parallelize_6d pass
 * with SetTrue6D, every element's indicator must be set.
 */
TEST(Parallelize6D, SingleThreadPoolAllItemsProcessed) {
	const size_t element_count =
		kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
		kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The helper takes a typed context pointer, so it is cast to the task signature. */
	const pthreadpool_task_6d_t task = reinterpret_cast<pthreadpool_task_6d_t>(SetTrue6D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_6d(
		threadpool.get(), task, context,
		kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
		kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
		0 /* flags */);

	/* Visit the elements in row-major order, extending the flat index one dimension at a time. */
	for (size_t i = 0; i < kParallelize6DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DRangeJ; j++) {
			const size_t ij = i * kParallelize6DRangeJ + j;
			for (size_t k = 0; k < kParallelize6DRangeK; k++) {
				const size_t ijk = ij * kParallelize6DRangeK + k;
				for (size_t l = 0; l < kParallelize6DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize6DRangeL + l;
					for (size_t m = 0; m < kParallelize6DRangeM; m++) {
						const size_t ijklm = ijkl * kParallelize6DRangeM + m;
						for (size_t n = 0; n < kParallelize6DRangeN; n++) {
							const size_t linear_idx = ijklm * kParallelize6DRangeN + n;
							EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
						}
					}
				}
			}
		}
	}
}
6384 
/*
 * Pool from pthreadpool_create(0): after one pthreadpool_parallelize_6d pass
 * with SetTrue6D, every element's indicator must be set.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6D, MultiThreadPoolAllItemsProcessed) {
	const size_t element_count =
		kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
		kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with a single thread cannot exercise the multi-threaded path. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_6d_t task = reinterpret_cast<pthreadpool_task_6d_t>(SetTrue6D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_6d(
		threadpool.get(), task, context,
		kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
		kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
		0 /* flags */);

	/* Visit the elements in row-major order, extending the flat index one dimension at a time. */
	for (size_t i = 0; i < kParallelize6DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DRangeJ; j++) {
			const size_t ij = i * kParallelize6DRangeJ + j;
			for (size_t k = 0; k < kParallelize6DRangeK; k++) {
				const size_t ijk = ij * kParallelize6DRangeK + k;
				for (size_t l = 0; l < kParallelize6DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize6DRangeL + l;
					for (size_t m = 0; m < kParallelize6DRangeM; m++) {
						const size_t ijklm = ijkl * kParallelize6DRangeM + m;
						for (size_t n = 0; n < kParallelize6DRangeN; n++) {
							const size_t linear_idx = ijklm * kParallelize6DRangeN + n;
							EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
						}
					}
				}
			}
		}
	}
}
6418 
Increment6D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l,size_t m,size_t n)6419 static void Increment6D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
6420 	const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n;
6421 	processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
6422 }
6423 
/*
 * Pool from pthreadpool_create(1): after one pthreadpool_parallelize_6d pass
 * with Increment6D, every element's counter must equal 1.
 */
TEST(Parallelize6D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t element_count =
		kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
		kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The helper takes a typed context pointer, so it is cast to the task signature. */
	const pthreadpool_task_6d_t task = reinterpret_cast<pthreadpool_task_6d_t>(Increment6D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_6d(
		threadpool.get(), task, context,
		kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
		kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
		0 /* flags */);

	/* Visit the elements in row-major order, extending the flat index one dimension at a time. */
	for (size_t i = 0; i < kParallelize6DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DRangeJ; j++) {
			const size_t ij = i * kParallelize6DRangeJ + j;
			for (size_t k = 0; k < kParallelize6DRangeK; k++) {
				const size_t ijk = ij * kParallelize6DRangeK + k;
				for (size_t l = 0; l < kParallelize6DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize6DRangeL + l;
					for (size_t m = 0; m < kParallelize6DRangeM; m++) {
						const size_t ijklm = ijkl * kParallelize6DRangeM + m;
						for (size_t n = 0; n < kParallelize6DRangeN; n++) {
							const size_t linear_idx = ijklm * kParallelize6DRangeN + n;
							EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
						}
					}
				}
			}
		}
	}
}
6454 
/*
 * Pool from pthreadpool_create(0): after one pthreadpool_parallelize_6d pass
 * with Increment6D, every element's counter must equal 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t element_count =
		kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
		kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with a single thread cannot exercise the multi-threaded path. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_6d_t task = reinterpret_cast<pthreadpool_task_6d_t>(Increment6D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_6d(
		threadpool.get(), task, context,
		kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
		kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
		0 /* flags */);

	/* Visit the elements in row-major order, extending the flat index one dimension at a time. */
	for (size_t i = 0; i < kParallelize6DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DRangeJ; j++) {
			const size_t ij = i * kParallelize6DRangeJ + j;
			for (size_t k = 0; k < kParallelize6DRangeK; k++) {
				const size_t ijk = ij * kParallelize6DRangeK + k;
				for (size_t l = 0; l < kParallelize6DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize6DRangeL + l;
					for (size_t m = 0; m < kParallelize6DRangeM; m++) {
						const size_t ijklm = ijkl * kParallelize6DRangeM + m;
						for (size_t n = 0; n < kParallelize6DRangeN; n++) {
							const size_t linear_idx = ijklm * kParallelize6DRangeN + n;
							EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
						}
					}
				}
			}
		}
	}
}
6489 
/*
 * Pool from pthreadpool_create(1): after kIncrementIterations6D consecutive
 * pthreadpool_parallelize_6d passes with Increment6D, every element's counter
 * must equal kIncrementIterations6D.
 */
TEST(Parallelize6D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	std::vector<std::atomic_int> counters(kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) {
		pthreadpool_parallelize_6d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_6d_t>(Increment6D),
			static_cast<void*>(counters.data()),
			kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
			0 /* flags */);
	}

	for (size_t i = 0; i < kParallelize6DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize6DRangeK; k++) {
				for (size_t l = 0; l < kParallelize6DRangeL; l++) {
					for (size_t m = 0; m < kParallelize6DRangeM; m++) {
						for (size_t n = 0; n < kParallelize6DRangeN; n++) {
							/*
							 * Same row-major index as Increment6D, including the trailing "+ n".
							 * Without it only the n == 0 counter of each (i, j, k, l, m) group
							 * would be inspected, so errors in the others would go unnoticed.
							 */
							const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n;
							EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
								<< "(expected: " << kIncrementIterations6D << ")";
						}
					}
				}
			}
		}
	}
}
6523 
/*
 * Pool from pthreadpool_create(0): after kIncrementIterations6D consecutive
 * pthreadpool_parallelize_6d passes with Increment6D, every element's counter
 * must equal kIncrementIterations6D.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count =
		kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
		kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with a single thread cannot exercise the multi-threaded path. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	/* Task and context are the same for every pass, so they are prepared once. */
	const pthreadpool_task_6d_t task = reinterpret_cast<pthreadpool_task_6d_t>(Increment6D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations6D; pass++) {
		pthreadpool_parallelize_6d(
			threadpool.get(), task, context,
			kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
			kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
			0 /* flags */);
	}

	/* Visit the elements in row-major order, extending the flat index one dimension at a time. */
	for (size_t i = 0; i < kParallelize6DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DRangeJ; j++) {
			const size_t ij = i * kParallelize6DRangeJ + j;
			for (size_t k = 0; k < kParallelize6DRangeK; k++) {
				const size_t ijk = ij * kParallelize6DRangeK + k;
				for (size_t l = 0; l < kParallelize6DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize6DRangeL + l;
					for (size_t m = 0; m < kParallelize6DRangeM; m++) {
						const size_t ijklm = ijkl * kParallelize6DRangeM + m;
						for (size_t n = 0; n < kParallelize6DRangeN; n++) {
							const size_t linear_idx = ijklm * kParallelize6DRangeN + n;
							EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
								<< "(expected: " << kIncrementIterations6D << ")";
						}
					}
				}
			}
		}
	}
}
6561 
/*
 * 6D task that adds one to a single shared counter with a relaxed atomic
 * increment. All six indices are accepted but not used.
 */
static void IncrementSame6D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
	std::atomic_fetch_add_explicit(num_processed_items, 1, std::memory_order_relaxed);
}
6565 
/*
 * Pool from pthreadpool_create(0): every task invocation increments the same
 * atomic counter (IncrementSame6D); afterwards the counter must equal the
 * total number of elements in the 6D range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6D, MultiThreadPoolHighContention) {
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with a single thread cannot produce contention. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_6d_t task = reinterpret_cast<pthreadpool_task_6d_t>(IncrementSame6D);
	void* const context = static_cast<void*>(&num_processed_items);
	pthreadpool_parallelize_6d(
		threadpool.get(), task, context,
		kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
		kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
		0 /* flags */);

	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN);
}
6584 
WorkImbalance6D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l,size_t m,size_t n)6585 static void WorkImbalance6D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
6586 	num_processed_items->fetch_add(1, std::memory_order_relaxed);
6587 	if (i == 0 && j == 0 && k == 0 && l == 0 && m == 0 && n == 0) {
6588 		/* Spin-wait until all items are computed */
6589 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN) {
6590 			std::atomic_thread_fence(std::memory_order_acquire);
6591 		}
6592 	}
6593 }
6594 
/*
 * Pool from pthreadpool_create(0): runs WorkImbalance6D, whose first element
 * blocks until all elements are counted, and then checks that the counter
 * equals the total number of elements in the 6D range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6D, MultiThreadPoolWorkStealing) {
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* With a single thread nobody else could finish the work the first element waits for. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_6d_t task = reinterpret_cast<pthreadpool_task_6d_t>(WorkImbalance6D);
	void* const context = static_cast<void*>(&num_processed_items);
	pthreadpool_parallelize_6d(
		threadpool.get(), task, context,
		kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
		kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
		0 /* flags */);

	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN);
}
6613 
/* No-op 6D/1D-tile task: ignores its context and all seven size arguments. */
static void ComputeNothing6DTile1D(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t) {
	/* Intentionally empty. */
}
6616 
/*
 * Pool from pthreadpool_create(1): pthreadpool_parallelize_6d_tile_1d with a
 * no-op task must simply return; there is nothing else to verify.
 */
TEST(Parallelize6DTile1D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_6d_tile_1d(
		threadpool.get(),
		ComputeNothing6DTile1D,
		nullptr /* context */,
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);
}
6628 
/*
 * Pool from pthreadpool_create(0): pthreadpool_parallelize_6d_tile_1d with a
 * no-op task must simply return. Skipped when the pool reports at most one
 * thread.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* A pool with a single thread cannot exercise the multi-threaded path. */
	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		threadpool.get(),
		ComputeNothing6DTile1D,
		nullptr /* context */,
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);
}
6645 
CheckBounds6DTile1D(void *,size_t i,size_t j,size_t k,size_t l,size_t m,size_t start_n,size_t tile_n)6646 static void CheckBounds6DTile1D(void*, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) {
6647 	EXPECT_LT(i, kParallelize6DTile1DRangeI);
6648 	EXPECT_LT(j, kParallelize6DTile1DRangeJ);
6649 	EXPECT_LT(k, kParallelize6DTile1DRangeK);
6650 	EXPECT_LT(l, kParallelize6DTile1DRangeL);
6651 	EXPECT_LT(m, kParallelize6DTile1DRangeM);
6652 	EXPECT_LT(start_n, kParallelize6DTile1DRangeN);
6653 	EXPECT_LE(start_n + tile_n, kParallelize6DTile1DRangeN);
6654 }
6655 
/*
 * Runs CheckBounds6DTile1D over the 6D range tiled along N on a pool created
 * with pthreadpool_create(1); the task itself asserts that indices are in range.
 */
TEST(Parallelize6DTile1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_6d_tile_1d(
		pool.get(),
		CheckBounds6DTile1D,
		nullptr /* task argument is unused */,
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);
}
6668 
/*
 * Runs CheckBounds6DTile1D over the 6D range tiled along N on a pool created
 * with pthreadpool_create(0); the task itself asserts that indices are in range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const size_t thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		pool.get(),
		CheckBounds6DTile1D,
		nullptr /* task argument is unused */,
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);
}
6685 
CheckTiling6DTile1D(void *,size_t i,size_t j,size_t k,size_t l,size_t m,size_t start_n,size_t tile_n)6686 static void CheckTiling6DTile1D(void*, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) {
6687 	EXPECT_GT(tile_n, 0);
6688 	EXPECT_LE(tile_n, kParallelize6DTile1DTileN);
6689 	EXPECT_EQ(start_n % kParallelize6DTile1DTileN, 0);
6690 	EXPECT_EQ(tile_n, std::min<size_t>(kParallelize6DTile1DTileN, kParallelize6DTile1DRangeN - start_n));
6691 }
6692 
/*
 * Runs CheckTiling6DTile1D over the 6D range tiled along N on a pool created
 * with pthreadpool_create(1); the task itself asserts tile size and alignment.
 */
TEST(Parallelize6DTile1D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_6d_tile_1d(
		pool.get(),
		CheckTiling6DTile1D,
		nullptr /* task argument is unused */,
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);
}
6705 
/*
 * Runs CheckTiling6DTile1D over the 6D range tiled along N on a pool created
 * with pthreadpool_create(0); the task itself asserts tile size and alignment.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const size_t thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		pool.get(),
		CheckTiling6DTile1D,
		nullptr /* task argument is unused */,
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);
}
6722 
SetTrue6DTile1D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l,size_t m,size_t start_n,size_t tile_n)6723 static void SetTrue6DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) {
6724 	for (size_t n = start_n; n < start_n + tile_n; n++) {
6725 		const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n;
6726 		processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
6727 	}
6728 }
6729 
/*
 * Runs SetTrue6DTile1D over the 6D range tiled along N on a pool created with
 * pthreadpool_create(1), then verifies that the indicator of every element in
 * the range was set.
 */
TEST(Parallelize6DTile1D, SingleThreadPoolAllItemsProcessed) {
	const auto total_items =
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN;
	std::vector<std::atomic_bool> processed(total_items);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_6d_tile_1d(
		pool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(SetTrue6DTile1D),
		static_cast<void*>(processed.data()),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	/* Elements are visited in row-major order, so flat_idx equals the linear index used by the task. */
	size_t flat_idx = 0;
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; ++j) {
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; ++k) {
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; ++l) {
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; ++m) {
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; ++n, ++flat_idx) {
							EXPECT_TRUE(processed[flat_idx].load(std::memory_order_relaxed))
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
						}
					}
				}
			}
		}
	}
}
6760 
/*
 * Runs SetTrue6DTile1D over the 6D range tiled along N on a pool created with
 * pthreadpool_create(0), then verifies that the indicator of every element in
 * the range was set. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolAllItemsProcessed) {
	const auto total_items =
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN;
	std::vector<std::atomic_bool> processed(total_items);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const size_t thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		pool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(SetTrue6DTile1D),
		static_cast<void*>(processed.data()),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	/* Elements are visited in row-major order, so flat_idx equals the linear index used by the task. */
	size_t flat_idx = 0;
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; ++j) {
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; ++k) {
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; ++l) {
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; ++m) {
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; ++n, ++flat_idx) {
							EXPECT_TRUE(processed[flat_idx].load(std::memory_order_relaxed))
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
						}
					}
				}
			}
		}
	}
}
6795 
Increment6DTile1D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l,size_t m,size_t start_n,size_t tile_n)6796 static void Increment6DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) {
6797 	for (size_t n = start_n; n < start_n + tile_n; n++) {
6798 		const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n;
6799 		processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
6800 	}
6801 }
6802 
/*
 * Runs Increment6DTile1D once over the 6D range tiled along N on a pool
 * created with pthreadpool_create(1), then verifies that every element's
 * counter equals 1.
 */
TEST(Parallelize6DTile1D, SingleThreadPoolEachItemProcessedOnce) {
	const auto total_items =
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN;
	std::vector<std::atomic_int> hit_counts(total_items);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_6d_tile_1d(
		pool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(Increment6DTile1D),
		static_cast<void*>(hit_counts.data()),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	/* Elements are visited in row-major order, so flat_idx equals the linear index used by the task. */
	size_t flat_idx = 0;
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; ++j) {
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; ++k) {
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; ++l) {
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; ++m) {
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; ++n, ++flat_idx) {
							const int times_processed = hit_counts[flat_idx].load(std::memory_order_relaxed);
							EXPECT_EQ(times_processed, 1)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< times_processed << " times (expected: 1)";
						}
					}
				}
			}
		}
	}
}
6834 
/*
 * Runs Increment6DTile1D once over the 6D range tiled along N on a pool
 * created with pthreadpool_create(0), then verifies that every element's
 * counter equals 1. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolEachItemProcessedOnce) {
	const auto total_items =
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN;
	std::vector<std::atomic_int> hit_counts(total_items);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const size_t thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		pool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(Increment6DTile1D),
		static_cast<void*>(hit_counts.data()),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	/* Elements are visited in row-major order, so flat_idx equals the linear index used by the task. */
	size_t flat_idx = 0;
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; ++j) {
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; ++k) {
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; ++l) {
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; ++m) {
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; ++n, ++flat_idx) {
							const int times_processed = hit_counts[flat_idx].load(std::memory_order_relaxed);
							EXPECT_EQ(times_processed, 1)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< times_processed << " times (expected: 1)";
						}
					}
				}
			}
		}
	}
}
6870 
/*
 * Runs Increment6DTile1D kIncrementIterations6D times over the 6D range tiled
 * along N on a pool created with pthreadpool_create(1), then verifies that
 * every element's counter equals kIncrementIterations6D.
 */
TEST(Parallelize6DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const auto total_items =
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN;
	std::vector<std::atomic_int> hit_counts(total_items);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	/* The same pool and counters are reused for every pass. */
	for (size_t pass = 0; pass < kIncrementIterations6D; ++pass) {
		pthreadpool_parallelize_6d_tile_1d(
			pool.get(),
			reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(Increment6DTile1D),
			static_cast<void*>(hit_counts.data()),
			kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
			kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
			kParallelize6DTile1DTileN,
			0 /* flags */);
	}

	/* Elements are visited in row-major order, so flat_idx equals the linear index used by the task. */
	size_t flat_idx = 0;
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; ++j) {
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; ++k) {
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; ++l) {
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; ++m) {
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; ++n, ++flat_idx) {
							const int times_processed = hit_counts[flat_idx].load(std::memory_order_relaxed);
							EXPECT_EQ(times_processed, kIncrementIterations6D)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< times_processed << " times "
								<< "(expected: " << kIncrementIterations6D << ")";
						}
					}
				}
			}
		}
	}
}
6905 
/*
 * Runs Increment6DTile1D kIncrementIterations6D times over the 6D range tiled
 * along N on a pool created with pthreadpool_create(0), then verifies that
 * every element's counter equals kIncrementIterations6D.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const auto total_items =
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN;
	std::vector<std::atomic_int> hit_counts(total_items);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const size_t thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	/* The same pool and counters are reused for every pass. */
	for (size_t pass = 0; pass < kIncrementIterations6D; ++pass) {
		pthreadpool_parallelize_6d_tile_1d(
			pool.get(),
			reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(Increment6DTile1D),
			static_cast<void*>(hit_counts.data()),
			kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
			kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
			kParallelize6DTile1DTileN,
			0 /* flags */);
	}

	/* Elements are visited in row-major order, so flat_idx equals the linear index used by the task. */
	size_t flat_idx = 0;
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; ++j) {
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; ++k) {
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; ++l) {
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; ++m) {
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; ++n, ++flat_idx) {
							const int times_processed = hit_counts[flat_idx].load(std::memory_order_relaxed);
							EXPECT_EQ(times_processed, kIncrementIterations6D)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< times_processed << " times "
								<< "(expected: " << kIncrementIterations6D << ")";
						}
					}
				}
			}
		}
	}
}
6944 
/*
 * Task that bumps one shared counter once per element of the tile
 * [start_n, start_n + tile_n). The increments are issued one at a time rather
 * than as a single fetch_add(tile_n).
 */
static void IncrementSame6DTile1D(
	std::atomic_int* num_processed_items,
	size_t /* i */, size_t /* j */, size_t /* k */, size_t /* l */, size_t /* m */,
	size_t start_n, size_t tile_n)
{
	const size_t end_n = start_n + tile_n;
	size_t n = start_n;
	while (n < end_n) {
		num_processed_items->fetch_add(1, std::memory_order_relaxed);
		++n;
	}
}
6950 
/*
 * Runs IncrementSame6DTile1D, which increments a single shared counter once
 * per element, over the 6D range tiled along N on a pool created with
 * pthreadpool_create(0). The final counter must equal the number of items in
 * the range. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolHighContention) {
	/* Direct initialization: the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(IncrementSame6DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	const size_t expected_items =
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN;
	/* Compare as size_t so the assertion does not mix signed and unsigned operands. */
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
6970 
WorkImbalance6DTile1D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l,size_t m,size_t start_n,size_t tile_n)6971 static void WorkImbalance6DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) {
6972 	num_processed_items->fetch_add(tile_n, std::memory_order_relaxed);
6973 	if (i == 0 && j == 0 && k == 0 && l == 0 && m == 0 && start_n == 0) {
6974 		/* Spin-wait until all items are computed */
6975 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN) {
6976 			std::atomic_thread_fence(std::memory_order_acquire);
6977 		}
6978 	}
6979 }
6980 
/*
 * Runs WorkImbalance6DTile1D over the 6D range tiled along N on a pool created
 * with pthreadpool_create(0). The task for the first tile spins until every
 * item has been counted, so the call can only return once the remaining tiles
 * have been processed while that task is still waiting. The final counter must
 * equal the number of items in the range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolWorkStealing) {
	/* Direct initialization: the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(WorkImbalance6DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	const size_t expected_items =
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN;
	/* Compare as size_t so the assertion does not mix signed and unsigned operands. */
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
7000 
/* No-op task used by the 6D-tile-2D completion tests: every argument is ignored. */
static void ComputeNothing6DTile2D(
	void* /* argument */,
	size_t /* i */, size_t /* j */, size_t /* k */, size_t /* l */,
	size_t /* start_m */, size_t /* start_n */,
	size_t /* tile_m */, size_t /* tile_n */)
{
}
7003 
/*
 * Smoke test: on a pool created with pthreadpool_create(1), dispatch the no-op
 * task over the 6D range tiled along M and N and expect the call to return.
 */
TEST(Parallelize6DTile2D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_6d_tile_2d(
		pool.get(),
		ComputeNothing6DTile2D,
		nullptr /* task argument is unused */,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7015 
/*
 * Smoke test on a pool created with pthreadpool_create(0): dispatch the no-op
 * task over the 6D range tiled along M and N and expect the call to return.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const size_t thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		pool.get(),
		ComputeNothing6DTile2D,
		nullptr /* task argument is unused */,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7032 
CheckBounds6DTile2D(void *,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t start_n,size_t tile_m,size_t tile_n)7033 static void CheckBounds6DTile2D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
7034 	EXPECT_LT(i, kParallelize6DTile2DRangeI);
7035 	EXPECT_LT(j, kParallelize6DTile2DRangeJ);
7036 	EXPECT_LT(k, kParallelize6DTile2DRangeK);
7037 	EXPECT_LT(l, kParallelize6DTile2DRangeL);
7038 	EXPECT_LT(start_m, kParallelize6DTile2DRangeM);
7039 	EXPECT_LT(start_n, kParallelize6DTile2DRangeN);
7040 	EXPECT_LE(start_m + tile_m, kParallelize6DTile2DRangeM);
7041 	EXPECT_LE(start_n + tile_n, kParallelize6DTile2DRangeN);
7042 }
7043 
/*
 * Runs CheckBounds6DTile2D over the 6D range tiled along M and N on a pool
 * created with pthreadpool_create(1); the task itself asserts that indices are
 * in range.
 */
TEST(Parallelize6DTile2D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_6d_tile_2d(
		pool.get(),
		CheckBounds6DTile2D,
		nullptr /* task argument is unused */,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7056 
/*
 * Runs CheckBounds6DTile2D over the 6D range tiled along M and N on a pool
 * created with pthreadpool_create(0); the task itself asserts that indices are
 * in range. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const size_t thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		pool.get(),
		CheckBounds6DTile2D,
		nullptr /* task argument is unused */,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7073 
CheckTiling6DTile2D(void *,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t start_n,size_t tile_m,size_t tile_n)7074 static void CheckTiling6DTile2D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
7075 	EXPECT_GT(tile_m, 0);
7076 	EXPECT_LE(tile_m, kParallelize6DTile2DTileM);
7077 	EXPECT_EQ(start_m % kParallelize6DTile2DTileM, 0);
7078 	EXPECT_EQ(tile_m, std::min<size_t>(kParallelize6DTile2DTileM, kParallelize6DTile2DRangeM - start_m));
7079 
7080 	EXPECT_GT(tile_n, 0);
7081 	EXPECT_LE(tile_n, kParallelize6DTile2DTileN);
7082 	EXPECT_EQ(start_n % kParallelize6DTile2DTileN, 0);
7083 	EXPECT_EQ(tile_n, std::min<size_t>(kParallelize6DTile2DTileN, kParallelize6DTile2DRangeN - start_n));
7084 }
7085 
/*
 * Runs CheckTiling6DTile2D over the 6D range tiled along M and N on a pool
 * created with pthreadpool_create(1); the task itself asserts tile size and
 * alignment.
 */
TEST(Parallelize6DTile2D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_6d_tile_2d(
		pool.get(),
		CheckTiling6DTile2D,
		nullptr /* task argument is unused */,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7098 
/*
 * Runs CheckTiling6DTile2D over the 6D range tiled along M and N on a pool
 * created with pthreadpool_create(0); the task itself asserts tile size and
 * alignment. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const size_t thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		pool.get(),
		CheckTiling6DTile2D,
		nullptr /* task argument is unused */,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7115 
SetTrue6DTile2D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t start_n,size_t tile_m,size_t tile_n)7116 static void SetTrue6DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
7117 	for (size_t m = start_m; m < start_m + tile_m; m++) {
7118 		for (size_t n = start_n; n < start_n + tile_n; n++) {
7119 			const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n;
7120 			processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
7121 		}
7122 	}
7123 }
7124 
/*
 * Runs SetTrue6DTile2D over the 6D range tiled along M and N on a pool created
 * with pthreadpool_create(1), then verifies that the indicator of every
 * element in the range was set.
 */
TEST(Parallelize6DTile2D, SingleThreadPoolAllItemsProcessed) {
	const auto total_items =
		kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK *
		kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_bool> processed(total_items);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_6d_tile_2d(
		pool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(SetTrue6DTile2D),
		static_cast<void*>(processed.data()),
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);

	/* Elements are visited in row-major order, so flat_idx equals the linear index used by the task. */
	size_t flat_idx = 0;
	for (size_t i = 0; i < kParallelize6DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize6DTile2DRangeJ; ++j) {
			for (size_t k = 0; k < kParallelize6DTile2DRangeK; ++k) {
				for (size_t l = 0; l < kParallelize6DTile2DRangeL; ++l) {
					for (size_t m = 0; m < kParallelize6DTile2DRangeM; ++m) {
						for (size_t n = 0; n < kParallelize6DTile2DRangeN; ++n, ++flat_idx) {
							EXPECT_TRUE(processed[flat_idx].load(std::memory_order_relaxed))
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
						}
					}
				}
			}
		}
	}
}
7155 
/*
 * Runs SetTrue6DTile2D over the 6D range tiled along M and N on a pool created
 * with pthreadpool_create(0), then verifies that the indicator of every
 * element in the range was set. Skipped when the pool reports at most one
 * thread.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolAllItemsProcessed) {
	const auto total_items =
		kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK *
		kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_bool> processed(total_items);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	const size_t thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		pool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(SetTrue6DTile2D),
		static_cast<void*>(processed.data()),
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);

	/* Elements are visited in row-major order, so flat_idx equals the linear index used by the task. */
	size_t flat_idx = 0;
	for (size_t i = 0; i < kParallelize6DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize6DTile2DRangeJ; ++j) {
			for (size_t k = 0; k < kParallelize6DTile2DRangeK; ++k) {
				for (size_t l = 0; l < kParallelize6DTile2DRangeL; ++l) {
					for (size_t m = 0; m < kParallelize6DTile2DRangeM; ++m) {
						for (size_t n = 0; n < kParallelize6DTile2DRangeN; ++n, ++flat_idx) {
							EXPECT_TRUE(processed[flat_idx].load(std::memory_order_relaxed))
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
						}
					}
				}
			}
		}
	}
}
7190 
Increment6DTile2D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t start_n,size_t tile_m,size_t tile_n)7191 static void Increment6DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
7192 	for (size_t m = start_m; m < start_m + tile_m; m++) {
7193 		for (size_t n = start_n; n < start_n + tile_n; n++) {
7194 			const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n;
7195 			processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
7196 		}
7197 	}
7198 }
7199 
/*
 * Runs Increment6DTile2D once over the 6D range tiled along M and N on a pool
 * created with pthreadpool_create(1), then verifies that every element's
 * counter equals 1.
 */
TEST(Parallelize6DTile2D, SingleThreadPoolEachItemProcessedOnce) {
	const auto total_items =
		kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK *
		kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_int> hit_counts(total_items);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool.get());

	pthreadpool_parallelize_6d_tile_2d(
		pool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D),
		static_cast<void*>(hit_counts.data()),
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);

	/* Elements are visited in row-major order, so flat_idx equals the linear index used by the task. */
	size_t flat_idx = 0;
	for (size_t i = 0; i < kParallelize6DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize6DTile2DRangeJ; ++j) {
			for (size_t k = 0; k < kParallelize6DTile2DRangeK; ++k) {
				for (size_t l = 0; l < kParallelize6DTile2DRangeL; ++l) {
					for (size_t m = 0; m < kParallelize6DTile2DRangeM; ++m) {
						for (size_t n = 0; n < kParallelize6DTile2DRangeN; ++n, ++flat_idx) {
							const int times_processed = hit_counts[flat_idx].load(std::memory_order_relaxed);
							EXPECT_EQ(times_processed, 1)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< times_processed << " times (expected: 1)";
						}
					}
				}
			}
		}
	}
}
7231 
/*
 * Runs pthreadpool_parallelize_6d_tile_2d once on a pool created with pthreadpool_create(0)
 * and expects the counter of every element of the 6D range to equal 1.
 * The test is skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t total_items =
		kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK *
		kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_int> counters(total_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D),
		static_cast<void*>(counters.data()),
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);

	/* Visit counters in storage order; the coordinates are recovered only for the failure message */
	for (size_t linear_idx = 0; linear_idx < total_items; linear_idx++) {
		size_t remainder = linear_idx;
		const size_t n = remainder % kParallelize6DTile2DRangeN;
		remainder /= kParallelize6DTile2DRangeN;
		const size_t m = remainder % kParallelize6DTile2DRangeM;
		remainder /= kParallelize6DTile2DRangeM;
		const size_t l = remainder % kParallelize6DTile2DRangeL;
		remainder /= kParallelize6DTile2DRangeL;
		const size_t k = remainder % kParallelize6DTile2DRangeK;
		remainder /= kParallelize6DTile2DRangeK;
		const size_t j = remainder % kParallelize6DTile2DRangeJ;
		const size_t i = remainder / kParallelize6DTile2DRangeJ;

		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
7267 
/*
 * Runs pthreadpool_parallelize_6d_tile_2d kIncrementIterations6D times on a pool created
 * with pthreadpool_create(1) and expects the counter of every element of the 6D range to
 * equal kIncrementIterations6D.
 */
TEST(Parallelize6DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t total_items =
		kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK *
		kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_int> counters(total_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) {
		pthreadpool_parallelize_6d_tile_2d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D),
			static_cast<void*>(counters.data()),
			kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
			kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
			kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
			0 /* flags */);
	}

	/* Visit counters in storage order; the coordinates are recovered only for the failure message */
	for (size_t linear_idx = 0; linear_idx < total_items; linear_idx++) {
		size_t remainder = linear_idx;
		const size_t n = remainder % kParallelize6DTile2DRangeN;
		remainder /= kParallelize6DTile2DRangeN;
		const size_t m = remainder % kParallelize6DTile2DRangeM;
		remainder /= kParallelize6DTile2DRangeM;
		const size_t l = remainder % kParallelize6DTile2DRangeL;
		remainder /= kParallelize6DTile2DRangeL;
		const size_t k = remainder % kParallelize6DTile2DRangeK;
		remainder /= kParallelize6DTile2DRangeK;
		const size_t j = remainder % kParallelize6DTile2DRangeJ;
		const size_t i = remainder / kParallelize6DTile2DRangeJ;

		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations6D << ")";
	}
}
7302 
/*
 * Runs pthreadpool_parallelize_6d_tile_2d kIncrementIterations6D times on a pool created
 * with pthreadpool_create(0) and expects the counter of every element of the 6D range to
 * equal kIncrementIterations6D. The test is skipped when the pool reports at most one thread.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t total_items =
		kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK *
		kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_int> counters(total_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) {
		pthreadpool_parallelize_6d_tile_2d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D),
			static_cast<void*>(counters.data()),
			kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
			kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
			kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
			0 /* flags */);
	}

	/* Visit counters in storage order; the coordinates are recovered only for the failure message */
	for (size_t linear_idx = 0; linear_idx < total_items; linear_idx++) {
		size_t remainder = linear_idx;
		const size_t n = remainder % kParallelize6DTile2DRangeN;
		remainder /= kParallelize6DTile2DRangeN;
		const size_t m = remainder % kParallelize6DTile2DRangeM;
		remainder /= kParallelize6DTile2DRangeM;
		const size_t l = remainder % kParallelize6DTile2DRangeL;
		remainder /= kParallelize6DTile2DRangeL;
		const size_t k = remainder % kParallelize6DTile2DRangeK;
		remainder /= kParallelize6DTile2DRangeK;
		const size_t j = remainder % kParallelize6DTile2DRangeJ;
		const size_t i = remainder / kParallelize6DTile2DRangeJ;

		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations6D << ")";
	}
}
7341 
/*
 * Task callback used with pthreadpool_parallelize_6d_tile_2d: adds 1 to the single shared
 * counter once per element of the tile [start_m, start_m + tile_m) x [start_n, start_n + tile_n).
 *
 * The coordinates i, j, k, l are not used. Increments are issued one element at a time
 * rather than as a single bulk addition.
 */
static void IncrementSame6DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
	const size_t end_m = start_m + tile_m;
	const size_t end_n = start_n + tile_n;

	for (size_t m = start_m; m < end_m; m++) {
		size_t n = start_n;
		while (n < end_n) {
			num_processed_items->fetch_add(1, std::memory_order_relaxed);
			n++;
		}
	}
}
7349 
/*
 * Has every task of pthreadpool_parallelize_6d_tile_2d increment one shared atomic counter
 * (via IncrementSame6DTile2D) and expects the final count to equal the number of elements
 * in the 6D range. Uses a pool created with pthreadpool_create(0); skipped when the pool
 * reports at most one thread.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolHighContention) {
	/* Direct initialization: the ATOMIC_VAR_INIT macro is deprecated since C++20 */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(IncrementSame6DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN);
}
7369 
WorkImbalance6DTile2D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t start_n,size_t tile_m,size_t tile_n)7370 static void WorkImbalance6DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
7371 	num_processed_items->fetch_add(tile_m * tile_n, std::memory_order_relaxed);
7372 	if (i == 0 && j == 0 && k == 0 && l == 0 && start_m == 0 && start_n == 0) {
7373 		/* Spin-wait until all items are computed */
7374 		while (num_processed_items->load(std::memory_order_relaxed) != kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN) {
7375 			std::atomic_thread_fence(std::memory_order_acquire);
7376 		}
7377 	}
7378 }
7379 
/*
 * Runs pthreadpool_parallelize_6d_tile_2d with WorkImbalance6DTile2D, whose all-zero-coordinate
 * task blocks until every element has been counted, and expects the final count to equal the
 * number of elements in the 6D range. Uses a pool created with pthreadpool_create(0); skipped
 * when the pool reports at most one thread.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolWorkStealing) {
	/* Direct initialization: the ATOMIC_VAR_INIT macro is deprecated since C++20 */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(WorkImbalance6DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN);
}
7399