1 /* Standard C headers */
2 #include <assert.h>
3 #include <stdbool.h>
4 #include <stdint.h>
5 #include <stdlib.h>
6 #include <string.h>
7
8 #if PTHREADPOOL_USE_CPUINFO
9 #include <cpuinfo.h>
10 #endif
11
12 /* Dependencies */
13 #include <fxdiv.h>
14
15 /* Public library header */
16 #include <pthreadpool.h>
17
18 /* Internal library headers */
19 #include "threadpool-atomics.h"
20 #include "threadpool-common.h"
21 #include "threadpool-object.h"
22 #include "threadpool-utils.h"
23
24
pthreadpool_thread_parallelize_1d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)25 PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_fastpath(
26 struct pthreadpool* threadpool,
27 struct thread_info* thread)
28 {
29 assert(threadpool != NULL);
30 assert(thread != NULL);
31
32 const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
33 void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
34
35 const size_t threads_count = threadpool->threads_count.value;
36 const size_t range_threshold = -threads_count;
37
38 /* Process thread's own range of items */
39 size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
40 while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
41 task(argument, range_start++);
42 }
43
44 /* There still may be other threads with work */
45 const size_t thread_number = thread->thread_number;
46 for (size_t tid = modulo_decrement(thread_number, threads_count);
47 tid != thread_number;
48 tid = modulo_decrement(tid, threads_count))
49 {
50 struct thread_info* other_thread = &threadpool->threads[tid];
51 while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
52 const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
53 task(argument, index);
54 }
55 }
56
57 /* Make changes by this thread visible to other threads */
58 pthreadpool_fence_release();
59 }
60
/*
 * Fast-path worker loop for pthreadpool_parallelize_1d_with_uarch():
 * same work-stealing scheme as the plain 1D fast path (drain own slice,
 * then steal from the tail of other threads' slices), but each task
 * invocation also receives a microarchitecture index for the current core.
 * Slice counters are claimed via relaxed atomic decrement; decremented
 * values >= -threads_count (as unsigned) mark an exhausted slice, since
 * each counter wraps past zero at most once per thread.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_with_id_t task = (pthreadpool_task_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_1d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
	/* Query the uarch of the core this thread currently runs on; fall back
	 * to the default if the reported index exceeds the caller's maximum. */
	uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
	if (uarch_index > threadpool->params.parallelize_1d_with_uarch.max_uarch_index) {
		uarch_index = default_uarch_index;
	}
#endif

	const size_t threads_count = threadpool->threads_count.value;
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, uarch_index, range_start++);
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Steal from the tail, away from the owner's forward cursor. */
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, uarch_index, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
105
/*
 * Fast-path worker loop for pthreadpool_parallelize_1d_tile_1d():
 * the index space is split into tiles of size `tile`; each slice counter
 * counts tiles, not items.  Same work-stealing scheme as the plain 1D fast
 * path: drain own slice, then steal tiles from the tail of other threads'
 * slices.  Decremented counter values >= -threads_count (as unsigned) mark
 * an exhausted slice, since each counter wraps past zero at most once per
 * thread.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile;
	/* range_start is a tile index; convert to the item index of the tile. */
	size_t tile_start = range_start * tile;

	const size_t range = threadpool->params.parallelize_1d_tile_1d.range;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clips the final, possibly partial, tile to the range end. */
		task(argument, tile_start, min(range - tile_start, tile));
		tile_start += tile;
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Steal a tile index from the tail of the victim's slice. */
			const size_t tile_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const size_t tile_start = tile_index * tile;
			task(argument, tile_start, min(range - tile_start, tile));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
147
pthreadpool_thread_parallelize_2d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)148 PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_fastpath(
149 struct pthreadpool* threadpool,
150 struct thread_info* thread)
151 {
152 assert(threadpool != NULL);
153 assert(thread != NULL);
154
155 const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
156 void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
157
158 const size_t threads_count = threadpool->threads_count.value;
159 const size_t range_threshold = -threads_count;
160
161 /* Process thread's own range of items */
162 const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
163 const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j;
164 const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j);
165 size_t i = index_i_j.quotient;
166 size_t j = index_i_j.remainder;
167
168 while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
169 task(argument, i, j);
170 if (++j == range_j.value) {
171 j = 0;
172 i += 1;
173 }
174 }
175
176 /* There still may be other threads with work */
177 const size_t thread_number = thread->thread_number;
178 for (size_t tid = modulo_decrement(thread_number, threads_count);
179 tid != thread_number;
180 tid = modulo_decrement(tid, threads_count))
181 {
182 struct thread_info* other_thread = &threadpool->threads[tid];
183 while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
184 const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
185 const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j);
186 task(argument, index_i_j.quotient, index_i_j.remainder);
187 }
188 }
189
190 /* Make changes by this thread visible to other threads */
191 pthreadpool_fence_release();
192 }
193
/*
 * Fast-path worker loop for pthreadpool_parallelize_2d_tile_1d():
 * the j dimension is tiled by tile_j, and the (i, j-tile) space is
 * linearized as i * tile_range_j + tile_index_j.  The thread drains its
 * own slice of linear tile indices — tracking (i, start_j) incrementally
 * to avoid a division per tile — then steals leftover tiles from the tail
 * of other threads' slices, decoding each stolen linear index with the
 * precomputed fxdiv divisor.  Decremented slice counters >= -threads_count
 * (as unsigned) mark exhaustion, since each counter wraps past zero at
 * most once per thread.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_1d_t task = (pthreadpool_task_2d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clips the final, possibly partial, j-tile to the range end. */
		task(argument, i, start_j, min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Steal a linear tile index from the tail and decode it. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
243
/*
 * Fast-path worker loop for pthreadpool_parallelize_2d_tile_2d():
 * both dimensions are tiled (tile_i x tile_j) and the tile grid is
 * linearized as tile_index_i * tile_range_j + tile_index_j.  The thread
 * drains its own slice of linear tile indices — tracking (start_i,
 * start_j) incrementally to avoid a division per tile — then steals
 * leftover tiles from the tail of other threads' slices, decoding each
 * stolen index with the precomputed fxdiv divisor.  Decremented slice
 * counters >= -threads_count (as unsigned) mark exhaustion, since each
 * counter wraps past zero at most once per thread.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j;
	size_t start_i = tile_index_i_j.quotient * tile_i;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clips the final, possibly partial, tile in each dimension. */
		task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Steal a linear tile index from the tail and decode it. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
296
/*
 * Fast-path worker loop for pthreadpool_parallelize_2d_tile_2d_with_uarch():
 * same tiled 2D work-stealing scheme as the plain 2d_tile_2d fast path,
 * but each task invocation also receives a microarchitecture index for
 * the current core.  Decremented slice counters >= -threads_count (as
 * unsigned) mark exhaustion, since each counter wraps past zero at most
 * once per thread.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_with_id_t task = (pthreadpool_task_2d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
	/* Query the uarch of the core this thread currently runs on; fall back
	 * to the default if the reported index exceeds the caller's maximum. */
	uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
	if (uarch_index > threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) {
		uarch_index = default_uarch_index;
	}
#endif

	const size_t threads_count = threadpool->threads_count.value;
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j;
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_result_size_t index = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t range_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i;
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j;
	size_t start_i = index.quotient * tile_i;
	size_t start_j = index.remainder * tile_j;

	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clips the final, possibly partial, tile in each dimension. */
		task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Steal a linear tile index from the tail and decode it. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
358
/*
 * Fast-path worker loop for pthreadpool_parallelize_3d(): the 3D space is
 * linearized as (i * range_j + j) * range_k + k.  The thread drains its
 * own slice of linear indices — tracking (i, j, k) incrementally with
 * nested carry logic to avoid divisions per item — then steals leftover
 * indices from the tail of other threads' slices, decoding each stolen
 * index with two precomputed fxdiv divisions.  Decremented slice counters
 * >= -threads_count (as unsigned) mark exhaustion, since each counter
 * wraps past zero at most once per thread.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;

	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k);
		/* Advance (i, j, k) in row-major order, carrying k -> j -> i. */
		if (++k == range_k.value) {
			k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Steal a linear index from the tail and decode it to (i, j, k). */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
411
/*
 * Fast-path worker loop for pthreadpool_parallelize_3d_tile_1d(): the k
 * dimension is tiled by tile_k, and the (i, j, k-tile) space is linearized
 * as (i * range_j + j) * tile_range_k + tile_index_k.  The thread drains
 * its own slice of linear tile indices — tracking (i, j, start_k)
 * incrementally with nested carry logic — then steals leftover tiles from
 * the tail of other threads' slices, decoding each stolen index with the
 * precomputed fxdiv divisors.  Decremented slice counters >= -threads_count
 * (as unsigned) mark exhaustion, since each counter wraps past zero at most
 * once per thread.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
	const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clips the final, possibly partial, k-tile to the range end. */
		task(argument, i, j, start_k, min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Steal a linear tile index from the tail and decode it. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
468
/*
 * Fast-path worker loop for pthreadpool_parallelize_3d_tile_2d(): the j and
 * k dimensions are tiled (tile_j x tile_k) and the (i, j-tile, k-tile)
 * grid is linearized as (i * tile_range_j + tile_index_j) * tile_range_k +
 * tile_index_k.  The thread drains its own slice of linear tile indices —
 * tracking (i, start_j, start_k) incrementally with nested carry logic —
 * then steals leftover tiles from the tail of other threads' slices,
 * decoding each stolen index with the precomputed fxdiv divisors.
 * Decremented slice counters >= -threads_count (as unsigned) mark
 * exhaustion, since each counter wraps past zero at most once per thread.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_t task = (pthreadpool_task_3d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clips the final, possibly partial, tile in each dimension. */
		task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Steal a linear tile index from the tail and decode it. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
529
/*
 * Fast-path worker loop for pthreadpool_parallelize_3d_tile_2d_with_uarch():
 * same tiled 3D work-stealing scheme as the plain 3d_tile_2d fast path,
 * but each task invocation also receives a microarchitecture index for
 * the current core.  Decremented slice counters >= -threads_count (as
 * unsigned) mark exhaustion, since each counter wraps past zero at most
 * once per thread.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_with_id_t task = (pthreadpool_task_3d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
	/* Query the uarch of the core this thread currently runs on; fall back
	 * to the default if the reported index exceeds the caller's maximum. */
	uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
	if (uarch_index > threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) {
		uarch_index = default_uarch_index;
	}
#endif

	const size_t threads_count = threadpool->threads_count.value;
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clips the final, possibly partial, tile in each dimension. */
		task(argument, uarch_index, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Steal a linear tile index from the tail and decode it. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
599
/*
 * Fast-path worker loop for pthreadpool_parallelize_4d(): the 4D space is
 * linearized as (i * range_j + j) * (range_k * range_l) + (k * range_l + l),
 * i.e. split into an (i, j) part and a combined (k, l) part via range_kl.
 * The thread drains its own slice of linear indices — tracking (i, j, k, l)
 * incrementally with nested carry logic to avoid divisions per item — then
 * steals leftover indices from the tail of other threads' slices, decoding
 * each stolen index with three precomputed fxdiv divisions.  Decremented
 * slice counters >= -threads_count (as unsigned) mark exhaustion, since
 * each counter wraps past zero at most once per thread.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* range_kl is the combined k*l extent: one division splits (i,j) from (k,l). */
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;

	const size_t range_k = threadpool->params.parallelize_4d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k, l);
		/* Advance (i, j, k, l) in row-major order, carrying l -> k -> j -> i. */
		if (++l == range_l.value) {
			l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Steal a linear index from the tail and decode it to (i, j, k, l). */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
660
/*
 * Fast-path worker for pthreadpool_parallelize_4d_tile_1d.
 *
 * The 4D iteration space (i, j, k, l), with the l dimension tiled in chunks
 * of tile_l, is linearized into a single index range. The thread first
 * consumes its own pre-assigned sub-range, advancing the multi-dimensional
 * indices incrementally, then steals leftover items from the tail of other
 * threads' ranges, and finally issues a release fence so its side effects
 * become visible to the thread that detects completion.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * Work items are claimed by atomically decrementing a range_length
	 * counter, which is allowed to wrap below zero. Because at most
	 * threads_count threads decrement the same counter concurrently, a
	 * wrapped value never drops below -threads_count in modular size_t
	 * arithmetic. Thus "decremented value < range_threshold" (unsigned
	 * compare) means an item was successfully claimed, while a value in
	 * [-threads_count, -1] means the range was already exhausted.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/*
	 * Decompose the linear start index into (i, j, k, l-tile) components
	 * using precomputed fxdiv divisors, avoiding hardware integer division.
	 */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = tile_index_k_l.quotient;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* The final tile in the l dimension may be partial. */
		task(argument, i, j, k, start_l, min(range_l - start_l, tile_l));
		/* Advance indices in row-major order without re-dividing. */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Visit other threads in decreasing order, wrapping modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Claim succeeded: take the stolen item's linear index from the tail of the victim's range. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
725
/*
 * Fast-path worker for pthreadpool_parallelize_4d_tile_2d.
 *
 * The 4D iteration space (i, j, k, l), with both k and l tiled (tile_k,
 * tile_l), is linearized into a single index range. The thread consumes its
 * own pre-assigned sub-range first, then work-steals single items from the
 * tail of other threads' ranges, and finally issues a release fence.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_t task = (pthreadpool_task_4d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps below zero when a range is exhausted; with at most
	 * threads_count concurrent decrements the wrapped value never drops
	 * below -threads_count, so (unsigned) values below this threshold mean
	 * an item was claimed and values at or above it mean "no items left".
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Split the linear start index into (i, j, k-tile, l-tile) via fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* Trailing tiles in the k and l dimensions may be partial. */
		task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		/* Advance indices in row-major order without re-dividing. */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Visit other threads in decreasing order, wrapping modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Claim succeeded: steal the item's linear index from the tail of the victim's range. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
793
/*
 * Fast-path worker for pthreadpool_parallelize_4d_tile_2d_with_uarch.
 *
 * Identical iteration structure to the 4d_tile_2d fast path (4D space with
 * tiled k and l dimensions, own-range processing followed by work stealing
 * and a release fence), except that each task invocation also receives the
 * microarchitecture index of the core the worker currently runs on.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_with_id_t task = (pthreadpool_task_4d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		/*
		 * Query the uarch index of the current core via cpuinfo; fall back
		 * to the default if the result exceeds the caller-provided maximum.
		 */
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps below zero when a range is exhausted; with at most
	 * threads_count concurrent decrements the wrapped value never drops
	 * below -threads_count, so (unsigned) values below this threshold mean
	 * an item was claimed and values at or above it mean "no items left".
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Split the linear start index into (i, j, k-tile, l-tile) via fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* Trailing tiles in the k and l dimensions may be partial. */
		task(argument, uarch_index, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		/* Advance indices in row-major order without re-dividing. */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Visit other threads in decreasing order, wrapping modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Claim succeeded: steal the item's linear index from the tail of the victim's range. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
870
/*
 * Fast-path worker for pthreadpool_parallelize_5d.
 *
 * The 5D iteration space (i, j, k, l, m) is linearized into a single index
 * range. The thread consumes its own pre-assigned sub-range first, advancing
 * the five indices incrementally, then work-steals single items from the
 * tail of other threads' ranges, and finally issues a release fence so its
 * side effects become visible to the thread that detects completion.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps below zero when a range is exhausted; with at most
	 * threads_count concurrent decrements the wrapped value never drops
	 * below -threads_count, so (unsigned) values below this threshold mean
	 * an item was claimed and values at or above it mean "no items left".
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/*
	 * Split the linear start index into (i, j, k, l, m): first into the
	 * (i*j*k) and (l*m) halves, then each half into individual indices,
	 * using precomputed fxdiv divisors to avoid hardware division.
	 */
	const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm;
	const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;

	const size_t range_l = threadpool->params.parallelize_5d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k, l, m);
		/* Advance indices in row-major order without re-dividing. */
		if (++m == range_m.value) {
			m = 0;
			if (++l == range_l) {
				l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Visit other threads in decreasing order, wrapping modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Claim succeeded: steal the item's linear index from the tail of the victim's range. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
938
/*
 * Fast-path worker for pthreadpool_parallelize_5d_tile_1d.
 *
 * The 5D iteration space (i, j, k, l, m), with the m dimension tiled in
 * chunks of tile_m, is linearized into a single index range. The thread
 * consumes its own pre-assigned sub-range first, then work-steals single
 * items from the tail of other threads' ranges, and finally issues a
 * release fence.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps below zero when a range is exhausted; with at most
	 * threads_count concurrent decrements the wrapped value never drops
	 * below -threads_count, so (unsigned) values below this threshold mean
	 * an item was claimed and values at or above it mean "no items left".
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Split the linear start index into (i, j, k, l, m-tile) via fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	size_t start_m = tile_index_ijkl_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m;
	const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* The final tile in the m dimension may be partial. */
		task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m));
		/* Advance indices in row-major order without re-dividing. */
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			if (++l == range_l.value) {
				l = 0;
				if (++k == range_k) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Visit other threads in decreasing order, wrapping modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Claim succeeded: steal the item's linear index from the tail of the victim's range. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			size_t start_m = tile_index_ijkl_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m,
				min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1011
/*
 * Fast-path worker for pthreadpool_parallelize_5d_tile_2d.
 *
 * The 5D iteration space (i, j, k, l, m), with the l and m dimensions tiled
 * (tile_l, tile_m), is linearized into a single index range. The thread
 * consumes its own pre-assigned sub-range first, then work-steals single
 * items from the tail of other threads' ranges, and finally issues a
 * release fence.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_tile_2d_t task = (pthreadpool_task_5d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps below zero when a range is exhausted; with at most
	 * threads_count concurrent decrements the wrapped value never drops
	 * below -threads_count, so (unsigned) values below this threshold mean
	 * an item was claimed and values at or above it mean "no items left".
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Split the linear start index into (i, j, k, l-tile, m-tile) via fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_lm = threadpool->params.parallelize_5d_tile_2d.tile_range_lm;
	const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(range_start, tile_range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d_tile_2d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_2d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l;
	const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t start_l = tile_index_l_m.quotient * tile_l;
	size_t start_m = tile_index_l_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m;
	const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* Trailing tiles in the l and m dimensions may be partial. */
		task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		/* Advance indices in row-major order without re-dividing. */
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			start_l += tile_l;
			if (start_l >= range_l) {
				start_l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Visit other threads in decreasing order, wrapping modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Claim succeeded: steal the item's linear index from the tail of the victim's range. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const size_t start_l = tile_index_l_m.quotient * tile_l;
			const size_t start_m = tile_index_l_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder,
				start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1087
/*
 * Fast-path worker for pthreadpool_parallelize_6d.
 *
 * The 6D iteration space (i, j, k, l, m, n) is linearized into a single
 * index range. The thread consumes its own pre-assigned sub-range first,
 * advancing the six indices incrementally, then work-steals single items
 * from the tail of other threads' ranges, and finally issues a release
 * fence so its side effects become visible to the thread that detects
 * completion.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_t task = (pthreadpool_task_6d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps below zero when a range is exhausted; with at most
	 * threads_count concurrent decrements the wrapped value never drops
	 * below -threads_count, so (unsigned) values below this threshold mean
	 * an item was claimed and values at or above it mean "no items left".
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/*
	 * Split the linear start index into (i, j, k, l, m, n): first into the
	 * (i*j*k) and (l*m*n) halves, then each half into individual indices,
	 * using precomputed fxdiv divisors to avoid hardware division.
	 */
	const struct fxdiv_divisor_size_t range_lmn = threadpool->params.parallelize_6d.range_lmn;
	const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(range_start, range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t range_n = threadpool->params.parallelize_6d.range_n;
	const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t n = index_lm_n.remainder;

	const size_t range_l = threadpool->params.parallelize_6d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k, l, m, n);
		/* Advance indices in row-major order without re-dividing. */
		if (++n == range_n.value) {
			n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Visit other threads in decreasing order, wrapping modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Claim succeeded: steal the item's linear index from the tail of the victim's range. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(linear_index, range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, index_lm_n.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1163
/*
 * Fast-path worker for pthreadpool_parallelize_6d_tile_1d.
 *
 * The 6D iteration space (i, j, k, l, m, n), with the n dimension tiled in
 * chunks of tile_n, is linearized into a single index range. The thread
 * consumes its own pre-assigned sub-range first, then work-steals single
 * items from the tail of other threads' ranges, and finally issues a
 * release fence.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_tile_1d_t task = (pthreadpool_task_6d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps below zero when a range is exhausted; with at most
	 * threads_count concurrent decrements the wrapped value never drops
	 * below -threads_count, so (unsigned) values below this threshold mean
	 * an item was claimed and values at or above it mean "no items left".
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Split the linear start index into (i, j, k, l, m, n-tile) via fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_lmn = threadpool->params.parallelize_6d_tile_1d.tile_range_lmn;
	const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(range_start, tile_range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d_tile_1d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_1d.tile_range_n;
	const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d_tile_1d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
	const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t start_n = tile_index_lm_n.remainder * tile_n;

	const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n;
	const size_t range_l = threadpool->params.parallelize_6d_tile_1d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* The final tile in the n dimension may be partial. */
		task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n));
		/* Advance indices in row-major order without re-dividing. */
		start_n += tile_n;
		if (start_n >= range_n) {
			start_n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Visit other threads in decreasing order, wrapping modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Claim succeeded: steal the item's linear index from the tail of the victim's range. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
			const size_t start_n = tile_index_lm_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder,
				start_n, min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1244
/*
 * Fast-path worker for pthreadpool_parallelize_6d_tile_2d: iterates a 6D
 * range (i, j, k, l, m, n) where the last two dimensions are processed in
 * tiles of tile_m x tile_n. Each 1D "work item" is a linearized tile index;
 * fxdiv divisions decompose it back into the 6D coordinates.
 *
 * "Fast path" means the task is invoked without saving/restoring the FPU
 * state and without per-item monitoring; synchronization relies only on
 * relaxed atomics plus a single release fence at the end.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Task pointer and argument were published before threads were released;
	 * relaxed loads suffice here (ordering is provided elsewhere). */
	const pthreadpool_task_6d_tile_2d_t task = (pthreadpool_task_6d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* range_length is decremented past zero (it wraps, size_t is unsigned).
	 * Each of the threads_count threads over-decrements at most once before
	 * noticing the range is exhausted, so a wrapped value can never go below
	 * -threads_count. Therefore (decremented value < -threads_count) is true
	 * exactly when the pre-decrement length was positive, i.e. this thread
	 * legitimately claimed an item. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the starting linear tile index into 6D coordinates:
	 *   linear = ((((i * range_j + j) * range_k + k) * range_l + l)
	 *             * tile_range_m + tile_m_idx) * tile_range_n + tile_n_idx
	 * Divisors are precomputed fxdiv structures to avoid hardware division. */
	const struct fxdiv_divisor_size_t tile_range_mn = threadpool->params.parallelize_6d_tile_2d.tile_range_mn;
	const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(range_start, tile_range_mn);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_6d_tile_2d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_2d.tile_range_n;
	const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_6d_tile_2d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m;
	const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	/* start_m/start_n are element offsets (tile index scaled by tile size). */
	size_t start_m = tile_index_m_n.quotient * tile_m;
	size_t start_n = tile_index_m_n.remainder * tile_n;

	const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n;
	const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m;
	const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k;
	/* Consume this thread's own sub-range front-to-back, advancing the 6D
	 * coordinates odometer-style instead of re-dividing on every item.
	 * min(range - start, tile) clips the last (possibly partial) tile. */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		start_n += tile_n;
		if (start_n >= range_n) {
			start_n = 0;
			start_m += tile_m;
			if (start_m >= range_m) {
				start_m = 0;
				if (++l == range_l.value) {
					l = 0;
					if (++k == range_k) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Work stealing: scan other threads in decreasing thread-id order
	 * (wrapping), opposite to the order in which ranges were assigned, to
	 * reduce contention with the victims' own front-to-back consumption. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Steal from the tail of the victim's range (range_end counts
			 * down) while the victim consumes from the head; each claimed
			 * linear index must be re-decomposed into 6D coordinates. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(linear_index, tile_range_mn);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
			const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			const size_t start_m = tile_index_m_n.quotient * tile_m;
			const size_t start_n = tile_index_m_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder,
				start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1328