/* pthreadpool: portable work-stealing parallelization kernels. */
/* Standard C headers */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#if PTHREADPOOL_USE_CPUINFO
	#include <cpuinfo.h>
#endif

/* Dependencies */
#include <fxdiv.h>

/* Public library header */
#include <pthreadpool.h>

/* Internal library headers */
#include "threadpool-atomics.h"
#include "threadpool-object.h"
#include "threadpool-utils.h"

pthreadpool_get_threads_count(struct pthreadpool * threadpool)24 size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) {
25 	if (threadpool == NULL) {
26 		return 1;
27 	}
28 
29 	return threadpool->threads_count.value;
30 }
31 
thread_parallelize_1d(struct pthreadpool * threadpool,struct thread_info * thread)32 static void thread_parallelize_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
33 	assert(threadpool != NULL);
34 	assert(thread != NULL);
35 
36 	const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
37 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
38 
39 	/* Process thread's own range of items */
40 	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
41 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
42 		task(argument, range_start++);
43 	}
44 
45 	/* There still may be other threads with work */
46 	const size_t thread_number = thread->thread_number;
47 	const size_t threads_count = threadpool->threads_count.value;
48 	for (size_t tid = modulo_decrement(thread_number, threads_count);
49 		tid != thread_number;
50 		tid = modulo_decrement(tid, threads_count))
51 	{
52 		struct thread_info* other_thread = &threadpool->threads[tid];
53 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
54 			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
55 			task(argument, index);
56 		}
57 	}
58 
59 	/* Make changes by this thread visible to other threads */
60 	pthreadpool_fence_release();
61 }
62 
thread_parallelize_1d_with_uarch(struct pthreadpool * threadpool,struct thread_info * thread)63 static void thread_parallelize_1d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
64 	assert(threadpool != NULL);
65 	assert(thread != NULL);
66 
67 	const pthreadpool_task_1d_with_id_t task = (pthreadpool_task_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
68 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
69 
70 	const uint32_t default_uarch_index = threadpool->params.parallelize_1d_with_uarch.default_uarch_index;
71 	uint32_t uarch_index = default_uarch_index;
72 	#if PTHREADPOOL_USE_CPUINFO
73 		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
74 		if (uarch_index > threadpool->params.parallelize_1d_with_uarch.max_uarch_index) {
75 			uarch_index = default_uarch_index;
76 		}
77 	#endif
78 
79 	/* Process thread's own range of items */
80 	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
81 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
82 		task(argument, uarch_index, range_start++);
83 	}
84 
85 	/* There still may be other threads with work */
86 	const size_t thread_number = thread->thread_number;
87 	const size_t threads_count = threadpool->threads_count.value;
88 	for (size_t tid = modulo_decrement(thread_number, threads_count);
89 		tid != thread_number;
90 		tid = modulo_decrement(tid, threads_count))
91 	{
92 		struct thread_info* other_thread = &threadpool->threads[tid];
93 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
94 			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
95 			task(argument, uarch_index, index);
96 		}
97 	}
98 
99 	/* Make changes by this thread visible to other threads */
100 	pthreadpool_fence_release();
101 }
102 
thread_parallelize_1d_tile_1d(struct pthreadpool * threadpool,struct thread_info * thread)103 static void thread_parallelize_1d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
104 	assert(threadpool != NULL);
105 	assert(thread != NULL);
106 
107 	const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
108 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
109 
110 	/* Process thread's own range of items */
111 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
112 	const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile;
113 	size_t tile_start = range_start * tile;
114 
115 	const size_t range = threadpool->params.parallelize_1d_tile_1d.range;
116 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
117 		task(argument, tile_start, min(range - tile_start, tile));
118 		tile_start += tile;
119 	}
120 
121 	/* There still may be other threads with work */
122 	const size_t thread_number = thread->thread_number;
123 	const size_t threads_count = threadpool->threads_count.value;
124 	for (size_t tid = modulo_decrement(thread_number, threads_count);
125 		tid != thread_number;
126 		tid = modulo_decrement(tid, threads_count))
127 	{
128 		struct thread_info* other_thread = &threadpool->threads[tid];
129 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
130 			const size_t tile_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
131 			const size_t tile_start = tile_index * tile;
132 			task(argument, tile_start, min(range - tile_start, tile));
133 		}
134 	}
135 
136 	/* Make changes by this thread visible to other threads */
137 	pthreadpool_fence_release();
138 }
139 
thread_parallelize_2d(struct pthreadpool * threadpool,struct thread_info * thread)140 static void thread_parallelize_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
141 	assert(threadpool != NULL);
142 	assert(thread != NULL);
143 
144 	const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
145 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
146 
147 	/* Process thread's own range of items */
148 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
149 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j;
150 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j);
151 	size_t i = index_i_j.quotient;
152 	size_t j = index_i_j.remainder;
153 
154 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
155 		task(argument, i, j);
156 		if (++j == range_j.value) {
157 			j = 0;
158 			i += 1;
159 		}
160 	}
161 
162 	/* There still may be other threads with work */
163 	const size_t thread_number = thread->thread_number;
164 	const size_t threads_count = threadpool->threads_count.value;
165 	for (size_t tid = modulo_decrement(thread_number, threads_count);
166 		tid != thread_number;
167 		tid = modulo_decrement(tid, threads_count))
168 	{
169 		struct thread_info* other_thread = &threadpool->threads[tid];
170 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
171 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
172 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j);
173 			task(argument, index_i_j.quotient, index_i_j.remainder);
174 		}
175 	}
176 
177 	/* Make changes by this thread visible to other threads */
178 	pthreadpool_fence_release();
179 }
180 
thread_parallelize_2d_tile_1d(struct pthreadpool * threadpool,struct thread_info * thread)181 static void thread_parallelize_2d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
182 	assert(threadpool != NULL);
183 	assert(thread != NULL);
184 
185 	const pthreadpool_task_2d_tile_1d_t task = (pthreadpool_task_2d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
186 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
187 
188 	/* Process thread's own range of items */
189 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
190 	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d.tile_range_j;
191 	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
192 	const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j;
193 	size_t i = tile_index_i_j.quotient;
194 	size_t start_j = tile_index_i_j.remainder * tile_j;
195 
196 	const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j;
197 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
198 		task(argument, i, start_j, min(range_j - start_j, tile_j));
199 		start_j += tile_j;
200 		if (start_j >= range_j) {
201 			start_j = 0;
202 			i += 1;
203 		}
204 	}
205 
206 	/* There still may be other threads with work */
207 	const size_t thread_number = thread->thread_number;
208 	const size_t threads_count = threadpool->threads_count.value;
209 	for (size_t tid = modulo_decrement(thread_number, threads_count);
210 		tid != thread_number;
211 		tid = modulo_decrement(tid, threads_count))
212 	{
213 		struct thread_info* other_thread = &threadpool->threads[tid];
214 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
215 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
216 			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
217 			const size_t start_j = tile_index_i_j.remainder * tile_j;
218 			task(argument, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j));
219 		}
220 	}
221 
222 	/* Make changes by this thread visible to other threads */
223 	pthreadpool_fence_release();
224 }
225 
thread_parallelize_2d_tile_2d(struct pthreadpool * threadpool,struct thread_info * thread)226 static void thread_parallelize_2d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
227 	assert(threadpool != NULL);
228 	assert(thread != NULL);
229 
230 	const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
231 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
232 
233 	/* Process thread's own range of items */
234 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
235 	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d.tile_range_j;
236 	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
237 	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i;
238 	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j;
239 	size_t start_i = tile_index_i_j.quotient * tile_i;
240 	size_t start_j = tile_index_i_j.remainder * tile_j;
241 
242 	const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i;
243 	const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j;
244 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
245 		task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
246 		start_j += tile_j;
247 		if (start_j >= range_j) {
248 			start_j = 0;
249 			start_i += tile_i;
250 		}
251 	}
252 
253 	/* There still may be other threads with work */
254 	const size_t thread_number = thread->thread_number;
255 	const size_t threads_count = threadpool->threads_count.value;
256 	for (size_t tid = modulo_decrement(thread_number, threads_count);
257 		tid != thread_number;
258 		tid = modulo_decrement(tid, threads_count))
259 	{
260 		struct thread_info* other_thread = &threadpool->threads[tid];
261 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
262 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
263 			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
264 			const size_t start_i = tile_index_i_j.quotient * tile_i;
265 			const size_t start_j = tile_index_i_j.remainder * tile_j;
266 			task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
267 		}
268 	}
269 
270 	/* Make changes by this thread visible to other threads */
271 	pthreadpool_fence_release();
272 }
273 
thread_parallelize_2d_tile_2d_with_uarch(struct pthreadpool * threadpool,struct thread_info * thread)274 static void thread_parallelize_2d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
275 	assert(threadpool != NULL);
276 	assert(thread != NULL);
277 
278 	const pthreadpool_task_2d_tile_2d_with_id_t task = (pthreadpool_task_2d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
279 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
280 
281 	const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index;
282 	uint32_t uarch_index = default_uarch_index;
283 	#if PTHREADPOOL_USE_CPUINFO
284 		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
285 		if (uarch_index > threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) {
286 			uarch_index = default_uarch_index;
287 		}
288 	#endif
289 
290 	/* Process thread's own range of items */
291 	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j;
292 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
293 	const struct fxdiv_result_size_t index = fxdiv_divide_size_t(range_start, tile_range_j);
294 	const size_t range_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i;
295 	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i;
296 	const size_t range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j;
297 	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j;
298 	size_t start_i = index.quotient * tile_i;
299 	size_t start_j = index.remainder * tile_j;
300 
301 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
302 		task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
303 		start_j += tile_j;
304 		if (start_j >= range_j) {
305 			start_j = 0;
306 			start_i += tile_i;
307 		}
308 	}
309 
310 	/* There still may be other threads with work */
311 	const size_t thread_number = thread->thread_number;
312 	const size_t threads_count = threadpool->threads_count.value;
313 	for (size_t tid = modulo_decrement(thread_number, threads_count);
314 		tid != thread_number;
315 		tid = modulo_decrement(tid, threads_count))
316 	{
317 		struct thread_info* other_thread = &threadpool->threads[tid];
318 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
319 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
320 			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
321 			const size_t start_i = tile_index_i_j.quotient * tile_i;
322 			const size_t start_j = tile_index_i_j.remainder * tile_j;
323 			task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
324 		}
325 	}
326 
327 	/* Make changes by this thread visible to other threads */
328 	pthreadpool_fence_release();
329 }
330 
thread_parallelize_3d(struct pthreadpool * threadpool,struct thread_info * thread)331 static void thread_parallelize_3d(struct pthreadpool* threadpool, struct thread_info* thread) {
332 	assert(threadpool != NULL);
333 	assert(thread != NULL);
334 
335 	const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
336 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
337 
338 	/* Process thread's own range of items */
339 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
340 	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k;
341 	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k);
342 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j;
343 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
344 	size_t i = index_i_j.quotient;
345 	size_t j = index_i_j.remainder;
346 	size_t k = index_ij_k.remainder;
347 
348 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
349 		task(argument, i, j, k);
350 		if (++k == range_k.value) {
351 			k = 0;
352 			if (++j == range_j.value) {
353 				j = 0;
354 				i += 1;
355 			}
356 		}
357 	}
358 
359 	/* There still may be other threads with work */
360 	const size_t thread_number = thread->thread_number;
361 	const size_t threads_count = threadpool->threads_count.value;
362 	for (size_t tid = modulo_decrement(thread_number, threads_count);
363 		tid != thread_number;
364 		tid = modulo_decrement(tid, threads_count))
365 	{
366 		struct thread_info* other_thread = &threadpool->threads[tid];
367 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
368 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
369 			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k);
370 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
371 			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder);
372 		}
373 	}
374 
375 	/* Make changes by this thread visible to other threads */
376 	pthreadpool_fence_release();
377 }
378 
thread_parallelize_3d_tile_1d(struct pthreadpool * threadpool,struct thread_info * thread)379 static void thread_parallelize_3d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
380 	assert(threadpool != NULL);
381 	assert(thread != NULL);
382 
383 	const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
384 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
385 
386 	/* Process thread's own range of items */
387 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
388 	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k;
389 	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
390 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j;
391 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
392 	const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k;
393 	size_t i = index_i_j.quotient;
394 	size_t j = index_i_j.remainder;
395 	size_t start_k = tile_index_ij_k.remainder * tile_k;
396 
397 	const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k;
398 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
399 		task(argument, i, j, start_k, min(range_k - start_k, tile_k));
400 		start_k += tile_k;
401 		if (start_k >= range_k) {
402 			start_k = 0;
403 			if (++j == range_j.value) {
404 				j = 0;
405 				i += 1;
406 			}
407 		}
408 	}
409 
410 	/* There still may be other threads with work */
411 	const size_t thread_number = thread->thread_number;
412 	const size_t threads_count = threadpool->threads_count.value;
413 	for (size_t tid = modulo_decrement(thread_number, threads_count);
414 		tid != thread_number;
415 		tid = modulo_decrement(tid, threads_count))
416 	{
417 		struct thread_info* other_thread = &threadpool->threads[tid];
418 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
419 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
420 			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
421 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
422 			const size_t start_k = tile_index_ij_k.remainder * tile_k;
423 			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
424 		}
425 	}
426 
427 	/* Make changes by this thread visible to other threads */
428 	pthreadpool_fence_release();
429 }
430 
thread_parallelize_3d_tile_2d(struct pthreadpool * threadpool,struct thread_info * thread)431 static void thread_parallelize_3d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
432 	assert(threadpool != NULL);
433 	assert(thread != NULL);
434 
435 	const pthreadpool_task_3d_tile_2d_t task = (pthreadpool_task_3d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
436 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
437 
438 	/* Process thread's own range of items */
439 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
440 	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d.tile_range_k;
441 	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
442 	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d.tile_range_j;
443 	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
444 	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j;
445 	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k;
446 	size_t i = tile_index_i_j.quotient;
447 	size_t start_j = tile_index_i_j.remainder * tile_j;
448 	size_t start_k = tile_index_ij_k.remainder * tile_k;
449 
450 	const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k;
451 	const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j;
452 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
453 		task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
454 		start_k += tile_k;
455 		if (start_k >= range_k) {
456 			start_k = 0;
457 			start_j += tile_j;
458 			if (start_j >= range_j) {
459 				start_j = 0;
460 				i += 1;
461 			}
462 		}
463 	}
464 
465 	/* There still may be other threads with work */
466 	const size_t thread_number = thread->thread_number;
467 	const size_t threads_count = threadpool->threads_count.value;
468 	for (size_t tid = modulo_decrement(thread_number, threads_count);
469 		tid != thread_number;
470 		tid = modulo_decrement(tid, threads_count))
471 	{
472 		struct thread_info* other_thread = &threadpool->threads[tid];
473 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
474 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
475 			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
476 			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
477 			const size_t start_j = tile_index_i_j.remainder * tile_j;
478 			const size_t start_k = tile_index_ij_k.remainder * tile_k;
479 			task(argument, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
480 		}
481 	}
482 
483 	/* Make changes by this thread visible to other threads */
484 	pthreadpool_fence_release();
485 }
486 
thread_parallelize_3d_tile_2d_with_uarch(struct pthreadpool * threadpool,struct thread_info * thread)487 static void thread_parallelize_3d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
488 	assert(threadpool != NULL);
489 	assert(thread != NULL);
490 
491 	const pthreadpool_task_3d_tile_2d_with_id_t task = (pthreadpool_task_3d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
492 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
493 
494 	const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index;
495 	uint32_t uarch_index = default_uarch_index;
496 	#if PTHREADPOOL_USE_CPUINFO
497 		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
498 		if (uarch_index > threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) {
499 			uarch_index = default_uarch_index;
500 		}
501 	#endif
502 
503 	/* Process thread's own range of items */
504 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
505 	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k;
506 	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
507 	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j;
508 	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
509 	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j;
510 	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k;
511 	size_t i = tile_index_i_j.quotient;
512 	size_t start_j = tile_index_i_j.remainder * tile_j;
513 	size_t start_k = tile_index_ij_k.remainder * tile_k;
514 
515 	const size_t range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k;
516 	const size_t range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j;
517 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
518 		task(argument, uarch_index, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
519 		start_k += tile_k;
520 		if (start_k >= range_k) {
521 			start_k = 0;
522 			start_j += tile_j;
523 			if (start_j >= range_j) {
524 				start_j = 0;
525 				i += 1;
526 			}
527 		}
528 	}
529 
530 	/* There still may be other threads with work */
531 	const size_t thread_number = thread->thread_number;
532 	const size_t threads_count = threadpool->threads_count.value;
533 	for (size_t tid = modulo_decrement(thread_number, threads_count);
534 		tid != thread_number;
535 		tid = modulo_decrement(tid, threads_count))
536 	{
537 		struct thread_info* other_thread = &threadpool->threads[tid];
538 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
539 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
540 			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
541 			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
542 			const size_t start_j = tile_index_i_j.remainder * tile_j;
543 			const size_t start_k = tile_index_ij_k.remainder * tile_k;
544 			task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
545 		}
546 	}
547 
548 	/* Make changes by this thread visible to other threads */
549 	pthreadpool_fence_release();
550 }
551 
thread_parallelize_4d(struct pthreadpool * threadpool,struct thread_info * thread)552 static void thread_parallelize_4d(struct pthreadpool* threadpool, struct thread_info* thread) {
553 	assert(threadpool != NULL);
554 	assert(thread != NULL);
555 
556 	const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
557 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
558 
559 	/* Process thread's own range of items */
560 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
561 	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl;
562 	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl);
563 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j;
564 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
565 	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l;
566 	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
567 	size_t i = index_i_j.quotient;
568 	size_t j = index_i_j.remainder;
569 	size_t k = index_k_l.quotient;
570 	size_t l = index_k_l.remainder;
571 
572 	const size_t range_k = threadpool->params.parallelize_4d.range_k;
573 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
574 		task(argument, i, j, k, l);
575 		if (++l == range_l.value) {
576 			l = 0;
577 			if (++k == range_k) {
578 				k = 0;
579 				if (++j == range_j.value) {
580 					j = 0;
581 					i += 1;
582 				}
583 			}
584 		}
585 	}
586 
587 	/* There still may be other threads with work */
588 	const size_t thread_number = thread->thread_number;
589 	const size_t threads_count = threadpool->threads_count.value;
590 	for (size_t tid = modulo_decrement(thread_number, threads_count);
591 		tid != thread_number;
592 		tid = modulo_decrement(tid, threads_count))
593 	{
594 		struct thread_info* other_thread = &threadpool->threads[tid];
595 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
596 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
597 			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl);
598 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
599 			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
600 			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder);
601 		}
602 	}
603 
604 	/* Make changes by this thread visible to other threads */
605 	pthreadpool_fence_release();
606 }
607 
/*
 * Worker entry point for pthreadpool_parallelize_4d_tile_1d.
 *
 * The work is a 4D (i, j, k, l) iteration space in which the innermost l
 * dimension is split into tiles of up to tile_l elements; each item in the
 * shared linear range identifies one (i, j, k, l-tile) tuple. The thread
 * first drains its own pre-assigned sub-range, then steals leftover items
 * from other threads.
 */
static void thread_parallelize_4d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: task/argument are set up before workers start --
	 * NOTE(review): the ordering presumably comes from the command-dispatch
	 * protocol outside this view; confirm against the caller. */
	const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l-tile) coordinates
	 * with precomputed fxdiv divisors, which replace runtime division by a
	 * multiply/shift sequence. */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = tile_index_k_l.quotient;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k;
	const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l;
	/* Claim one item at a time; advance (i, j, k, start_l) like an odometer
	 * instead of re-dividing the linear index each iteration. The min()
	 * clamps the final (partial) tile in the l dimension. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, start_l, min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Work stealing: visit other threads in reverse circular order
	 * (thread_number-1, thread_number-2, ...) and take items from the tail
	 * of each victim's range (range_end) -- the opposite end from the one
	 * the victim itself consumes (range_start). Stolen coordinates are
	 * re-derived from the linear index via fxdiv. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
667 
/*
 * Worker entry point for pthreadpool_parallelize_4d_tile_2d.
 *
 * The work is a 4D (i, j, k, l) iteration space in which the two innermost
 * dimensions are processed in 2D tiles of up to tile_k x tile_l elements;
 * each item in the shared linear range identifies one (i, j, k-tile, l-tile)
 * tuple. The thread drains its own sub-range first, then steals from others.
 */
static void thread_parallelize_4d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_t task = (pthreadpool_task_4d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k-tile, l-tile) using
	 * precomputed fxdiv divisors (no hardware division). */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k;
	/* Claim one tile at a time; advance (i, j, start_k, start_l) like an
	 * odometer. The min() calls clamp partial edge tiles in k and l. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Work stealing: visit other threads in reverse circular order and take
	 * items from the tail of each victim's range (range_end), the opposite
	 * end from the one the victim consumes (range_start). */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
730 
/*
 * Worker entry point for pthreadpool_parallelize_4d_tile_2d_with_uarch.
 *
 * Same iteration scheme as thread_parallelize_4d_tile_2d (4D space with
 * 2D tile_k x tile_l tiles over the innermost dimensions), except that each
 * task invocation also receives a microarchitecture index so the task can
 * pick a uarch-specific kernel.
 */
static void thread_parallelize_4d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_with_id_t task = (pthreadpool_task_4d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Resolve the uarch index for the core this thread currently runs on;
	 * fall back to the caller-provided default when cpuinfo is unavailable
	 * or reports an index above the caller's declared maximum. */
	const uint32_t default_uarch_index = threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k-tile, l-tile) using
	 * precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k;
	/* Claim one tile at a time; odometer-style coordinate advance; min()
	 * clamps partial edge tiles in k and l. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Work stealing: reverse circular scan of other threads, consuming from
	 * the tail (range_end) of each victim's range. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
802 
/*
 * Worker entry point for pthreadpool_parallelize_5d.
 *
 * The work is a plain (untiled) 5D (i, j, k, l, m) iteration space; each
 * item in the shared linear range corresponds to one (i, j, k, l, m) tuple.
 * The thread drains its own pre-assigned sub-range first, then steals
 * leftover items from other threads.
 */
static void thread_parallelize_5d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l, m): first split
	 * off the combined (l, m) sub-index with the range_lm divisor, then
	 * peel the remaining dimensions with precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm;
	const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;

	const size_t range_l = threadpool->params.parallelize_5d.range_l;
	/* Claim one item at a time; advance (i, j, k, l, m) like an odometer
	 * with m as the fastest-varying digit. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, m);
		if (++m == range_m.value) {
			m = 0;
			if (++l == range_l) {
				l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Work stealing: visit other threads in reverse circular order and take
	 * items from the tail of each victim's range (range_end), the opposite
	 * end from the one the victim consumes (range_start). */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
865 
/*
 * Worker entry point for pthreadpool_parallelize_5d_tile_1d.
 *
 * The work is a 5D (i, j, k, l, m) iteration space in which the innermost m
 * dimension is split into tiles of up to tile_m elements; each item in the
 * shared linear range identifies one (i, j, k, l, m-tile) tuple. The thread
 * drains its own sub-range first, then steals from other threads.
 */
static void thread_parallelize_5d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l, m-tile) using
	 * precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	size_t start_m = tile_index_ijkl_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m;
	const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k;
	/* Claim one item at a time; odometer-style coordinate advance; min()
	 * clamps the final (partial) tile in the m dimension. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m));
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			if (++l == range_l.value) {
				l = 0;
				if (++k == range_k) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Work stealing: reverse circular scan of other threads, consuming from
	 * the tail (range_end) of each victim's range. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			size_t start_m = tile_index_ijkl_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m,
				min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
933 
/*
 * Worker entry point for pthreadpool_parallelize_5d_tile_2d.
 *
 * The work is a 5D (i, j, k, l, m) iteration space in which the two
 * innermost dimensions are processed in 2D tiles of up to tile_l x tile_m
 * elements; each item in the shared linear range identifies one
 * (i, j, k, l-tile, m-tile) tuple. The thread drains its own sub-range
 * first, then steals from other threads.
 */
static void thread_parallelize_5d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_tile_2d_t task = (pthreadpool_task_5d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l-tile, m-tile) using
	 * precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_lm = threadpool->params.parallelize_5d_tile_2d.tile_range_lm;
	const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(range_start, tile_range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d_tile_2d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_2d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l;
	const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t start_l = tile_index_l_m.quotient * tile_l;
	size_t start_m = tile_index_l_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m;
	const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l;
	/* Claim one tile at a time; odometer-style coordinate advance; the
	 * min() calls clamp partial edge tiles in l and m. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			start_l += tile_l;
			if (start_l >= range_l) {
				start_l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Work stealing: reverse circular scan of other threads, consuming from
	 * the tail (range_end) of each victim's range. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const size_t start_l = tile_index_l_m.quotient * tile_l;
			const size_t start_m = tile_index_l_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder,
				start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1004 
/*
 * Worker entry point for pthreadpool_parallelize_6d.
 *
 * The work is a plain (untiled) 6D (i, j, k, l, m, n) iteration space; each
 * item in the shared linear range corresponds to one (i, j, k, l, m, n)
 * tuple. The thread drains its own pre-assigned sub-range first, then
 * steals leftover items from other threads.
 */
static void thread_parallelize_6d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_t task = (pthreadpool_task_6d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l, m, n): split off
	 * the combined (l, m, n) sub-index with the range_lmn divisor, then
	 * peel the remaining dimensions with precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t range_lmn = threadpool->params.parallelize_6d.range_lmn;
	const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(range_start, range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t range_n = threadpool->params.parallelize_6d.range_n;
	const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t n = index_lm_n.remainder;

	const size_t range_l = threadpool->params.parallelize_6d.range_l;
	/* Claim one item at a time; advance (i, j, k, l, m, n) like an odometer
	 * with n as the fastest-varying digit. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, m, n);
		if (++n == range_n.value) {
			n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}


	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Work stealing: visit other threads in reverse circular order and take
	 * items from the tail of each victim's range (range_end), the opposite
	 * end from the one the victim consumes (range_start). */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(linear_index, range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, index_lm_n.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1075 
/*
 * Worker entry point for pthreadpool_parallelize_6d_tile_1d.
 *
 * The work is a 6D (i, j, k, l, m, n) iteration space in which the
 * innermost n dimension is split into tiles of up to tile_n elements; each
 * item in the shared linear range identifies one (i, j, k, l, m, n-tile)
 * tuple. The thread drains its own sub-range first, then steals from
 * other threads.
 */
static void thread_parallelize_6d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_tile_1d_t task = (pthreadpool_task_6d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l, m, n-tile) using
	 * precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_lmn = threadpool->params.parallelize_6d_tile_1d.tile_range_lmn;
	const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(range_start, tile_range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d_tile_1d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_1d.tile_range_n;
	const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d_tile_1d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
	const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t start_n = tile_index_lm_n.remainder * tile_n;

	const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n;
	const size_t range_l = threadpool->params.parallelize_6d_tile_1d.range_l;
	/* Claim one item at a time; odometer-style coordinate advance; min()
	 * clamps the final (partial) tile in the n dimension. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n));
		start_n += tile_n;
		if (start_n >= range_n) {
			start_n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}


	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Work stealing: reverse circular scan of other threads, consuming from
	 * the tail (range_end) of each victim's range. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
			const size_t start_n = tile_index_lm_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder,
				start_n, min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1151 
/*
 * Worker entry point for pthreadpool_parallelize_6d_tile_2d.
 *
 * The work is a 6D (i, j, k, l, m, n) iteration space in which the two
 * innermost dimensions are processed in 2D tiles of up to tile_m x tile_n
 * elements; each item in the shared linear range identifies one
 * (i, j, k, l, m-tile, n-tile) tuple. The thread drains its own sub-range
 * first, then steals from other threads.
 */
static void thread_parallelize_6d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_tile_2d_t task = (pthreadpool_task_6d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l, m-tile, n-tile)
	 * using precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_mn = threadpool->params.parallelize_6d_tile_2d.tile_range_mn;
	const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(range_start, tile_range_mn);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_6d_tile_2d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_2d.tile_range_n;
	const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_6d_tile_2d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m;
	const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	size_t start_m = tile_index_m_n.quotient * tile_m;
	size_t start_n = tile_index_m_n.remainder * tile_n;

	const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n;
	const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m;
	const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k;
	/* Claim one tile at a time; odometer-style coordinate advance; the
	 * min() calls clamp partial edge tiles in m and n. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		start_n += tile_n;
		if (start_n >= range_n) {
			start_n = 0;
			start_m += tile_m;
			if (start_m >= range_m) {
				start_m = 0;
				if (++l == range_l.value) {
					l = 0;
					if (++k == range_k) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Work stealing: reverse circular scan of other threads, consuming from
	 * the tail (range_end) of each victim's range -- the opposite end from
	 * the one the victim itself consumes (range_start). */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(linear_index, tile_range_mn);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
			const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			const size_t start_m = tile_index_m_n.quotient * tile_m;
			const size_t start_n = tile_index_m_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder,
				start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1230 
/*
 * Invoke task(argument, i) for every i in [0, range).
 * Runs sequentially on the calling thread when there is no pool, the pool
 * has a single thread, or the range holds at most one item.
 */
void pthreadpool_parallelize_1d(
	struct pthreadpool* threadpool,
	pthreadpool_task_1d_t task,
	void* argument,
	size_t range,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	if (num_threads <= 1 || range <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range; i++) {
			task(argument, i);
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	thread_function_t thread_function = &thread_parallelize_1d;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_1d_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, NULL, 0,
		(void*) task, argument, range, flags);
}
1265 
/*
 * Invoke task(argument, uarch_index, i) for every i in [0, range), where
 * uarch_index identifies the microarchitecture the task executes on
 * (clamped to [0, max_uarch_index], defaulting to default_uarch_index).
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_1d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_1d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	if (num_threads <= 1 || range <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				/* cpuinfo reported an index beyond what the caller supports */
				uarch_index = default_uarch_index;
			}
		#endif

		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range; i++) {
			task(argument, uarch_index, i);
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const struct pthreadpool_1d_with_uarch_params params = {
		.default_uarch_index = default_uarch_index,
		.max_uarch_index = max_uarch_index,
	};
	thread_function_t thread_function = &thread_parallelize_1d_with_uarch;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_1d_with_uarch_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, range, flags);
}
1315 
/*
 * Invoke task(argument, i, tile_size) for tile start offsets i = 0, tile,
 * 2*tile, ... < range; the final tile may be shorter than tile.
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_1d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_1d_tile_1d_t task,
	void* argument,
	size_t range,
	size_t tile,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	if (num_threads <= 1 || range <= tile) {
		/* No thread pool used: execute task sequentially on the calling thread */
		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range; i += tile) {
			task(argument, i, min(range - i, tile));
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const size_t tile_range = divide_round_up(range, tile);
	const struct pthreadpool_1d_tile_1d_params params = {
		.range = range,
		.tile = tile,
	};
	thread_function_t thread_function = &thread_parallelize_1d_tile_1d;
	#if PTHREADPOOL_USE_FASTPATH
		/* Threshold is checked against the untiled range, as in the other 1D variants */
		const size_t fastpath_threshold = -num_threads;
		if (range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_1d_tile_1d_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, tile_range, flags);
}
1356 
/*
 * Invoke task(argument, i, j) for every (i, j) in [0, range_i) x [0, range_j).
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	/* (range_i | range_j) <= 1 holds iff both ranges are 0 or 1 */
	if (num_threads <= 1 || (range_i | range_j) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				task(argument, i, j);
			}
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const size_t range = range_i * range_j;
	const struct pthreadpool_2d_params params = {
		.range_j = fxdiv_init_size_t(range_j),
	};
	thread_function_t thread_function = &thread_parallelize_2d;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_2d_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, range, flags);
}
1398 
/*
 * Invoke task(argument, i, j, tile_size) for every i in [0, range_i) and
 * tile start offsets j = 0, tile_j, 2*tile_j, ... < range_j.
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_2d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t tile_j,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	if (num_threads <= 1 || (range_i <= 1 && range_j <= tile_j)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				task(argument, i, j, min(range_j - j, tile_j));
			}
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const size_t tile_range_j = divide_round_up(range_j, tile_j);
	const size_t tile_range = range_i * tile_range_j;
	const struct pthreadpool_2d_tile_1d_params params = {
		.range_j = range_j,
		.tile_j = tile_j,
		.tile_range_j = fxdiv_init_size_t(tile_range_j),
	};
	thread_function_t thread_function = &thread_parallelize_2d_tile_1d;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (tile_range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_2d_tile_1d_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, tile_range, flags);
}
1444 
/*
 * Invoke task(argument, i, j, tile_size_i, tile_size_j) over a 2D grid of
 * tiles covering [0, range_i) x [0, range_j); edge tiles may be short.
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_2d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t tile_i,
	size_t tile_j,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	if (num_threads <= 1 || (range_i <= tile_i && range_j <= tile_j)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i += tile_i) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				task(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
			}
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const size_t tile_range_i = divide_round_up(range_i, tile_i);
	const size_t tile_range_j = divide_round_up(range_j, tile_j);
	const size_t tile_range = tile_range_i * tile_range_j;
	const struct pthreadpool_2d_tile_2d_params params = {
		.range_i = range_i,
		.tile_i = tile_i,
		.range_j = range_j,
		.tile_j = tile_j,
		.tile_range_j = fxdiv_init_size_t(tile_range_j),
	};
	thread_function_t thread_function = &thread_parallelize_2d_tile_2d;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (tile_range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_2d_tile_2d_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, tile_range, flags);
}
1494 
/*
 * Like pthreadpool_parallelize_2d_tile_2d, but also passes a
 * microarchitecture index (clamped to [0, max_uarch_index], defaulting to
 * default_uarch_index) to the task.
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_2d_tile_2d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_tile_2d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range_i,
	size_t range_j,
	size_t tile_i,
	size_t tile_j,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	if (num_threads <= 1 || (range_i <= tile_i && range_j <= tile_j)) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				/* cpuinfo reported an index beyond what the caller supports */
				uarch_index = default_uarch_index;
			}
		#endif

		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i += tile_i) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				task(argument, uarch_index, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
			}
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const size_t tile_range_i = divide_round_up(range_i, tile_i);
	const size_t tile_range_j = divide_round_up(range_j, tile_j);
	const size_t tile_range = tile_range_i * tile_range_j;
	const struct pthreadpool_2d_tile_2d_with_uarch_params params = {
		.default_uarch_index = default_uarch_index,
		.max_uarch_index = max_uarch_index,
		.range_i = range_i,
		.tile_i = tile_i,
		.range_j = range_j,
		.tile_j = tile_j,
		.tile_range_j = fxdiv_init_size_t(tile_range_j),
	};
	thread_function_t thread_function = &thread_parallelize_2d_tile_2d_with_uarch;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (tile_range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, tile_range, flags);
}
1557 
/*
 * Invoke task(argument, i, j, k) for every point of the 3D grid
 * [0, range_i) x [0, range_j) x [0, range_k).
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_3d(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	/* The bitwise OR is <= 1 iff every range is 0 or 1 */
	if (num_threads <= 1 || (range_i | range_j | range_k) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					task(argument, i, j, k);
				}
			}
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const size_t range = range_i * range_j * range_k;
	const struct pthreadpool_3d_params params = {
		.range_j = fxdiv_init_size_t(range_j),
		.range_k = fxdiv_init_size_t(range_k),
	};
	thread_function_t thread_function = &thread_parallelize_3d;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_3d_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, range, flags);
}
1603 
/*
 * Invoke task(argument, i, j, k, tile_size) for every (i, j) in
 * [0, range_i) x [0, range_j) and tile start offsets k = 0, tile_k, ... < range_k.
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_3d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_k,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	if (num_threads <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					task(argument, i, j, k, min(range_k - k, tile_k));
				}
			}
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const size_t tile_range_k = divide_round_up(range_k, tile_k);
	const size_t tile_range = range_i * range_j * tile_range_k;
	const struct pthreadpool_3d_tile_1d_params params = {
		.range_k = range_k,
		.tile_k = tile_k,
		.range_j = fxdiv_init_size_t(range_j),
		.tile_range_k = fxdiv_init_size_t(tile_range_k),
	};
	thread_function_t thread_function = &thread_parallelize_3d_tile_1d;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (tile_range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_3d_tile_1d_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, tile_range, flags);
}
1653 
/*
 * Invoke task(argument, i, j, k, tile_size_j, tile_size_k) for every i in
 * [0, range_i) over a 2D tile grid covering [0, range_j) x [0, range_k).
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_3d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_j,
	size_t tile_k,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	if (num_threads <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					task(argument, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k));
				}
			}
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const size_t tile_range_j = divide_round_up(range_j, tile_j);
	const size_t tile_range_k = divide_round_up(range_k, tile_k);
	const size_t tile_range = range_i * tile_range_j * tile_range_k;
	const struct pthreadpool_3d_tile_2d_params params = {
		.range_j = range_j,
		.tile_j = tile_j,
		.range_k = range_k,
		.tile_k = tile_k,
		.tile_range_j = fxdiv_init_size_t(tile_range_j),
		.tile_range_k = fxdiv_init_size_t(tile_range_k),
	};
	thread_function_t thread_function = &thread_parallelize_3d_tile_2d;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (tile_range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_3d_tile_2d_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, tile_range, flags);
}
1707 
/*
 * Like pthreadpool_parallelize_3d_tile_2d, but also passes a
 * microarchitecture index (clamped to [0, max_uarch_index], defaulting to
 * default_uarch_index) to the task.
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_3d_tile_2d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_tile_2d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_j,
	size_t tile_k,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	if (num_threads <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				/* cpuinfo reported an index beyond what the caller supports */
				uarch_index = default_uarch_index;
			}
		#endif

		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					task(argument, uarch_index, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k));
				}
			}
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const size_t tile_range_j = divide_round_up(range_j, tile_j);
	const size_t tile_range_k = divide_round_up(range_k, tile_k);
	const size_t tile_range = range_i * tile_range_j * tile_range_k;
	const struct pthreadpool_3d_tile_2d_with_uarch_params params = {
		.default_uarch_index = default_uarch_index,
		.max_uarch_index = max_uarch_index,
		.range_j = range_j,
		.tile_j = tile_j,
		.range_k = range_k,
		.tile_k = tile_k,
		.tile_range_j = fxdiv_init_size_t(tile_range_j),
		.tile_range_k = fxdiv_init_size_t(tile_range_k),
	};
	thread_function_t thread_function = &thread_parallelize_3d_tile_2d_with_uarch;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (tile_range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, tile_range, flags);
}
1774 
/*
 * Invoke task(argument, i, j, k, l) for every point of the 4D grid
 * [0, range_i) x [0, range_j) x [0, range_k) x [0, range_l).
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_4d(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	/* The bitwise OR is <= 1 iff every range is 0 or 1 */
	if (num_threads <= 1 || (range_i | range_j | range_k | range_l) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						task(argument, i, j, k, l);
					}
				}
			}
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	/* The (k, l) pair is linearized so the worker needs only two divisions per index */
	const size_t range_kl = range_k * range_l;
	const size_t range = range_i * range_j * range_kl;
	const struct pthreadpool_4d_params params = {
		.range_k = range_k,
		.range_j = fxdiv_init_size_t(range_j),
		.range_kl = fxdiv_init_size_t(range_kl),
		.range_l = fxdiv_init_size_t(range_l),
	};
	thread_function_t thread_function = &thread_parallelize_4d;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_4d_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, range, flags);
}
1826 
/*
 * Invoke task(argument, i, j, k, l, tile_size) for every (i, j, k) in the 3D
 * grid and tile start offsets l = 0, tile_l, 2*tile_l, ... < range_l.
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_4d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_l,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	if (num_threads <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						task(argument, i, j, k, l, min(range_l - l, tile_l));
					}
				}
			}
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const size_t tile_range_l = divide_round_up(range_l, tile_l);
	/* The (k, l-tile) pair is linearized for the workers' index arithmetic */
	const size_t tile_range_kl = range_k * tile_range_l;
	const size_t tile_range = range_i * range_j * tile_range_kl;
	const struct pthreadpool_4d_tile_1d_params params = {
		.range_k = range_k,
		.range_l = range_l,
		.tile_l = tile_l,
		.range_j = fxdiv_init_size_t(range_j),
		.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
		.tile_range_l = fxdiv_init_size_t(tile_range_l),
	};
	thread_function_t thread_function = &thread_parallelize_4d_tile_1d;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (tile_range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_4d_tile_1d_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, tile_range, flags);
}
1882 
/*
 * Invoke task(argument, i, j, k, l, tile_size_k, tile_size_l) for every
 * (i, j) in the 2D grid over a 2D tile grid covering
 * [0, range_k) x [0, range_l); edge tiles may be short.
 * Runs sequentially on the calling thread when no parallelism is available.
 */
void pthreadpool_parallelize_4d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_k,
	size_t tile_l,
	uint32_t flags)
{
	const size_t num_threads = (threadpool == NULL) ? 1 : threadpool->threads_count.value;
	if (num_threads <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		const bool disable_denormals = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (disable_denormals) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						task(argument, i, j, k, l,
							min(range_k - k, tile_k), min(range_l - l, tile_l));
					}
				}
			}
		}
		if (disable_denormals) {
			set_fpu_state(saved_fpu_state);
		}
		return;
	}

	const size_t tile_range_l = divide_round_up(range_l, tile_l);
	/* The (k-tile, l-tile) pair is linearized for the workers' index arithmetic */
	const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l;
	const size_t tile_range = range_i * range_j * tile_range_kl;
	const struct pthreadpool_4d_tile_2d_params params = {
		.range_k = range_k,
		.tile_k = tile_k,
		.range_l = range_l,
		.tile_l = tile_l,
		.range_j = fxdiv_init_size_t(range_j),
		.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
		.tile_range_l = fxdiv_init_size_t(tile_range_l),
	};
	thread_function_t thread_function = &thread_parallelize_4d_tile_2d;
	#if PTHREADPOOL_USE_FASTPATH
		/* -num_threads wraps to SIZE_MAX - num_threads + 1 (unsigned arithmetic) */
		const size_t fastpath_threshold = -num_threads;
		if (tile_range < fastpath_threshold) {
			thread_function = &pthreadpool_thread_parallelize_4d_tile_2d_fastpath;
		}
	#endif
	pthreadpool_parallelize(
		threadpool, thread_function, &params, sizeof(params),
		(void*) task, argument, tile_range, flags);
}
1941 
void pthreadpool_parallelize_4d_tile_2d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_tile_2d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_k,
	size_t tile_l,
	uint32_t flags)
{
	/* Treat a missing thread pool as a pool with a single thread. */
	size_t threads_count = 0;
	if (threadpool != NULL) {
		threads_count = threadpool->threads_count.value;
	}
	/* (range_i | range_j) <= 1 holds iff both ranges are 0 or 1, i.e. the
	 * whole iteration space is at most one tile. */
	const bool single_tile =
		(range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l;
	if (threads_count <= 1 || single_tile) {
		/* No thread pool used: execute the task sequentially on the calling thread. */
		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			/* Query the uarch of the calling core; fall back to the default
			 * index when the reported index exceeds the caller's bound. */
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				uarch_index = default_uarch_index;
			}
		#endif

		const bool denormals_off = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (denormals_off) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					/* Height of this tile row; invariant over the l loop. */
					const size_t tile_height = min(range_k - k, tile_k);
					for (size_t l = 0; l < range_l; l += tile_l) {
						task(argument, uarch_index, i, j, k, l,
							tile_height, min(range_l - l, tile_l));
					}
				}
			}
		}
		if (denormals_off) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		/* Flatten the tiled 4D iteration space into a linear tile range. */
		const size_t tile_range_l = divide_round_up(range_l, tile_l);
		const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l;
		const size_t tile_range = range_i * range_j * tile_range_kl;
		const struct pthreadpool_4d_tile_2d_with_uarch_params params = {
			.default_uarch_index = default_uarch_index,
			.max_uarch_index = max_uarch_index,
			.range_k = range_k,
			.tile_k = tile_k,
			.range_l = range_l,
			.tile_l = tile_l,
			.range_j = fxdiv_init_size_t(range_j),
			.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
			.tile_range_l = fxdiv_init_size_t(tile_range_l),
		};
		thread_function_t thread_function = &thread_parallelize_4d_tile_2d_with_uarch;
		#if PTHREADPOOL_USE_FASTPATH
			/* -threads_count wraps around in size_t: the fast path is safe
			 * only when tile_range + threads_count cannot overflow. */
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				thread_function = &pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, thread_function, &params, sizeof(params),
			task, argument, tile_range, flags);
	}
}
2013 
void pthreadpool_parallelize_5d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	uint32_t flags)
{
	/* Treat a missing thread pool as a pool with a single thread. */
	size_t threads_count = 0;
	if (threadpool != NULL) {
		threads_count = threadpool->threads_count.value;
	}
	/* The bitwise OR is <= 1 iff every range is 0 or 1: nothing to split. */
	const bool single_item =
		(range_i | range_j | range_k | range_l | range_m) <= 1;
	if (threads_count <= 1 || single_item) {
		/* No thread pool used: execute the task sequentially on the calling thread. */
		const bool denormals_off = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (denormals_off) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m++) {
							task(argument, i, j, k, l, m);
						}
					}
				}
			}
		}
		if (denormals_off) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		/* Flatten the 5D iteration space; the two innermost dims are fused. */
		const size_t range_lm = range_l * range_m;
		const size_t range = range_i * range_j * range_k * range_lm;
		const struct pthreadpool_5d_params params = {
			.range_l = range_l,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.range_lm = fxdiv_init_size_t(range_lm),
			.range_m = fxdiv_init_size_t(range_m),
		};
		thread_function_t thread_function = &thread_parallelize_5d;
		#if PTHREADPOOL_USE_FASTPATH
			/* -threads_count wraps around in size_t: the fast path is safe
			 * only when range + threads_count cannot overflow. */
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				thread_function = &pthreadpool_thread_parallelize_5d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, thread_function, &params, sizeof(params),
			task, argument, range, flags);
	}
}
2069 
void pthreadpool_parallelize_5d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t tile_m,
	uint32_t flags)
{
	/* Treat a missing thread pool as a pool with a single thread. */
	size_t threads_count = 0;
	if (threadpool != NULL) {
		threads_count = threadpool->threads_count.value;
	}
	/* The bitwise OR is <= 1 iff all untiled ranges are 0 or 1; together
	 * with range_m fitting in one tile there is at most one work item. */
	const bool single_tile =
		(range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m;
	if (threads_count <= 1 || single_tile) {
		/* No thread pool used: execute the task sequentially on the calling thread. */
		const bool denormals_off = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (denormals_off) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m += tile_m) {
							task(argument, i, j, k, l, m, min(range_m - m, tile_m));
						}
					}
				}
			}
		}
		if (denormals_off) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		/* Flatten the tiled 5D iteration space into a linear tile range. */
		const size_t tile_range_m = divide_round_up(range_m, tile_m);
		const size_t range_kl = range_k * range_l;
		const size_t tile_range = range_i * range_j * range_kl * tile_range_m;
		const struct pthreadpool_5d_tile_1d_params params = {
			.range_k = range_k,
			.range_m = range_m,
			.tile_m = tile_m,
			.range_j = fxdiv_init_size_t(range_j),
			.range_kl = fxdiv_init_size_t(range_kl),
			.range_l = fxdiv_init_size_t(range_l),
			.tile_range_m = fxdiv_init_size_t(tile_range_m),
		};
		thread_function_t thread_function = &thread_parallelize_5d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			/* -threads_count wraps around in size_t: the fast path is safe
			 * only when tile_range + threads_count cannot overflow. */
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				thread_function = &pthreadpool_thread_parallelize_5d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, thread_function, &params, sizeof(params),
			task, argument, tile_range, flags);
	}
}
2129 
void pthreadpool_parallelize_5d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t tile_l,
	size_t tile_m,
	uint32_t flags)
{
	/* Treat a missing thread pool as a pool with a single thread. */
	size_t threads_count = 0;
	if (threadpool != NULL) {
		threads_count = threadpool->threads_count.value;
	}
	/* The bitwise OR is <= 1 iff all untiled ranges are 0 or 1; together
	 * with both tiled ranges fitting in one tile there is one work item. */
	const bool single_tile =
		(range_i | range_j | range_k) <= 1 && range_l <= tile_l && range_m <= tile_m;
	if (threads_count <= 1 || single_tile) {
		/* No thread pool used: execute the task sequentially on the calling thread. */
		const bool denormals_off = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (denormals_off) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						/* Height of this tile row; invariant over the m loop. */
						const size_t tile_height = min(range_l - l, tile_l);
						for (size_t m = 0; m < range_m; m += tile_m) {
							task(argument, i, j, k, l, m,
								tile_height, min(range_m - m, tile_m));
						}
					}
				}
			}
		}
		if (denormals_off) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		/* Flatten the tiled 5D iteration space into a linear tile range. */
		const size_t tile_range_m = divide_round_up(range_m, tile_m);
		const size_t tile_range_lm = divide_round_up(range_l, tile_l) * tile_range_m;
		const size_t tile_range = range_i * range_j * range_k * tile_range_lm;
		const struct pthreadpool_5d_tile_2d_params params = {
			.range_l = range_l,
			.tile_l = tile_l,
			.range_m = range_m,
			.tile_m = tile_m,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.tile_range_lm = fxdiv_init_size_t(tile_range_lm),
			.tile_range_m = fxdiv_init_size_t(tile_range_m),
		};
		thread_function_t thread_function = &thread_parallelize_5d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			/* -threads_count wraps around in size_t: the fast path is safe
			 * only when tile_range + threads_count cannot overflow. */
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				thread_function = &pthreadpool_thread_parallelize_5d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, thread_function, &params, sizeof(params),
			task, argument, tile_range, flags);
	}
}
2192 
void pthreadpool_parallelize_6d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	uint32_t flags)
{
	/* Treat a missing thread pool as a pool with a single thread. */
	size_t threads_count = 0;
	if (threadpool != NULL) {
		threads_count = threadpool->threads_count.value;
	}
	/* The bitwise OR is <= 1 iff every range is 0 or 1: nothing to split. */
	const bool single_item =
		(range_i | range_j | range_k | range_l | range_m | range_n) <= 1;
	if (threads_count <= 1 || single_item) {
		/* No thread pool used: execute the task sequentially on the calling thread. */
		const bool denormals_off = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (denormals_off) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m++) {
							for (size_t n = 0; n < range_n; n++) {
								task(argument, i, j, k, l, m, n);
							}
						}
					}
				}
			}
		}
		if (denormals_off) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		/* Flatten the 6D iteration space; the three innermost dims are fused. */
		const size_t range_lmn = range_l * range_m * range_n;
		const size_t range = range_i * range_j * range_k * range_lmn;
		const struct pthreadpool_6d_params params = {
			.range_l = range_l,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.range_lmn = fxdiv_init_size_t(range_lmn),
			.range_m = fxdiv_init_size_t(range_m),
			.range_n = fxdiv_init_size_t(range_n),
		};
		thread_function_t thread_function = &thread_parallelize_6d;
		#if PTHREADPOOL_USE_FASTPATH
			/* -threads_count wraps around in size_t: the fast path is safe
			 * only when range + threads_count cannot overflow. */
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				thread_function = &pthreadpool_thread_parallelize_6d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, thread_function, &params, sizeof(params),
			task, argument, range, flags);
	}
}
2252 
void pthreadpool_parallelize_6d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	size_t tile_n,
	uint32_t flags)
{
	/* Treat a missing thread pool as a pool with a single thread. */
	size_t threads_count = 0;
	if (threadpool != NULL) {
		threads_count = threadpool->threads_count.value;
	}
	/* The bitwise OR is <= 1 iff all untiled ranges are 0 or 1; together
	 * with range_n fitting in one tile there is at most one work item. */
	const bool single_tile =
		(range_i | range_j | range_k | range_l | range_m) <= 1 && range_n <= tile_n;
	if (threads_count <= 1 || single_tile) {
		/* No thread pool used: execute the task sequentially on the calling thread. */
		const bool denormals_off = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (denormals_off) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m++) {
							for (size_t n = 0; n < range_n; n += tile_n) {
								task(argument, i, j, k, l, m, n, min(range_n - n, tile_n));
							}
						}
					}
				}
			}
		}
		if (denormals_off) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		/* Flatten the tiled 6D iteration space into a linear tile range. */
		const size_t tile_range_n = divide_round_up(range_n, tile_n);
		const size_t tile_range_lmn = range_l * range_m * tile_range_n;
		const size_t tile_range = range_i * range_j * range_k * tile_range_lmn;
		const struct pthreadpool_6d_tile_1d_params params = {
			.range_l = range_l,
			.range_n = range_n,
			.tile_n = tile_n,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.tile_range_lmn = fxdiv_init_size_t(tile_range_lmn),
			.range_m = fxdiv_init_size_t(range_m),
			.tile_range_n = fxdiv_init_size_t(tile_range_n),
		};
		thread_function_t thread_function = &thread_parallelize_6d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			/* -threads_count wraps around in size_t: the fast path is safe
			 * only when tile_range + threads_count cannot overflow. */
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				thread_function = &pthreadpool_thread_parallelize_6d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, thread_function, &params, sizeof(params),
			task, argument, tile_range, flags);
	}
}
2316 
void pthreadpool_parallelize_6d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	size_t tile_m,
	size_t tile_n,
	uint32_t flags)
{
	/* Treat a missing thread pool as a pool with a single thread. */
	size_t threads_count = 0;
	if (threadpool != NULL) {
		threads_count = threadpool->threads_count.value;
	}
	/* The bitwise OR is <= 1 iff all untiled ranges are 0 or 1; together
	 * with both tiled ranges fitting in one tile there is one work item. */
	const bool single_tile =
		(range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m && range_n <= tile_n;
	if (threads_count <= 1 || single_tile) {
		/* No thread pool used: execute the task sequentially on the calling thread. */
		const bool denormals_off = (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) != 0;
		struct fpu_state saved_fpu_state = { 0 };
		if (denormals_off) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m += tile_m) {
							/* Height of this tile row; invariant over the n loop. */
							const size_t tile_height = min(range_m - m, tile_m);
							for (size_t n = 0; n < range_n; n += tile_n) {
								task(argument, i, j, k, l, m, n,
									tile_height, min(range_n - n, tile_n));
							}
						}
					}
				}
			}
		}
		if (denormals_off) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		/* Flatten the tiled 6D iteration space into a linear tile range;
		 * the k and l dimensions are fused. */
		const size_t range_kl = range_k * range_l;
		const size_t tile_range_n = divide_round_up(range_n, tile_n);
		const size_t tile_range_mn = divide_round_up(range_m, tile_m) * tile_range_n;
		const size_t tile_range = range_i * range_j * range_kl * tile_range_mn;
		const struct pthreadpool_6d_tile_2d_params params = {
			.range_k = range_k,
			.range_m = range_m,
			.tile_m = tile_m,
			.range_n = range_n,
			.tile_n = tile_n,
			.range_j = fxdiv_init_size_t(range_j),
			.range_kl = fxdiv_init_size_t(range_kl),
			.range_l = fxdiv_init_size_t(range_l),
			.tile_range_mn = fxdiv_init_size_t(tile_range_mn),
			.tile_range_n = fxdiv_init_size_t(tile_range_n),
		};
		thread_function_t thread_function = &thread_parallelize_6d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			/* -threads_count wraps around in size_t: the fast path is safe
			 * only when tile_range + threads_count cannot overflow. */
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				thread_function = &pthreadpool_thread_parallelize_6d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, thread_function, &params, sizeof(params),
			task, argument, tile_range, flags);
	}
}
2385