1 #ifndef PTHREADPOOL_H_
2 #define PTHREADPOOL_H_
3
4 #include <stddef.h>
5 #include <stdint.h>
6
7 typedef struct pthreadpool* pthreadpool_t;
8
9 typedef void (*pthreadpool_task_1d_t)(void*, size_t); /* called as (context, i) */
10 typedef void (*pthreadpool_task_1d_with_thread_t)(void*, size_t, size_t); /* called as (context, thread_index, i) */
11 typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t); /* called as (context, i, min(range - i, tile)) */
12 typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t); /* called as (context, i, j) */
13 typedef void (*pthreadpool_task_2d_with_thread_t)(void*, size_t, size_t, size_t); /* called as (context, thread_index, i, j) */
14 typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t); /* called as (context, i, j, min(range_j - j, tile_j)) */
15 typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t); /* called as (context, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)) */
16 typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t); /* called as (context, i, j, k) */
17 typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t); /* called as (context, i, j, k, min(range_k - k, tile_k)) */
18 typedef void (*pthreadpool_task_3d_tile_1d_with_thread_t)(void*, size_t, size_t, size_t, size_t, size_t); /* called as (context, thread_index, i, j, k, min(range_k - k, tile_k)) */
19 typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t);
20 typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t);
21 typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t);
22 typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
23 typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t);
24 typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
25 typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
26 typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
27 typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
28 typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
29
30 typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t); /* called as (context, uarch_index, i) */
31 typedef void (*pthreadpool_task_2d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t); /* called as (context, uarch_index, i, j, min(range_j - j, tile_j)) */
32 typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); /* called as (context, uarch_index, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)) */
33 typedef void (*pthreadpool_task_3d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); /* called as (context, uarch_index, i, j, k, min(range_k - k, tile_k)) */
34 typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t);
35 typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t);
36
37 typedef void (*pthreadpool_task_2d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t); /* called as (context, uarch_index, thread_index, i, j, min(range_j - j, tile_j)) */
38 typedef void (*pthreadpool_task_3d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t); /* NOTE(review): presumably called as (context, uarch_index, thread_index, i, j, k, min(range_k - k, tile_k)) -- confirm against the prototype that takes this type */
39
40
41 /**
42 * Disable support for denormalized numbers to the maximum extent possible for
43 * the duration of the computation.
44 *
45 * Handling denormalized floating-point numbers is often implemented in
46 * microcode, and incurs significant performance degradation. This hint
47 * instructs the thread pool to disable support for denormalized numbers before
48 * running the computation by manipulating architecture-specific control
49 * registers, and restore the initial value of control registers after the
50 * computation is complete. The thread pool temporarily disables denormalized
51 * numbers on all threads involved in the computation (i.e. the caller threads,
52 * and potentially worker threads).
53 *
54 * Disabling denormalized numbers may have a small negative effect on results'
55 * accuracy. As various architectures differ in capabilities to control
56 * processing of denormalized numbers, using this flag may also hurt results'
57 * reproducibility across different instruction set architectures.
58 */
59 #define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001
60
61 /**
62 * Yield worker threads to the system scheduler after the operation is finished.
63 *
64 * Force workers to use kernel wait (instead of active spin-wait by default) for
65 * new commands after this command is processed. This flag affects only the
66 * immediate next operation on this thread pool. To make the thread pool always
67 * use kernel wait, pass this flag to all parallelization functions.
68 */
69 #define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002
70
71 #ifdef __cplusplus
72 extern "C" {
73 #endif
74
75 /**
76 * Create a thread pool with the specified number of threads.
77 *
78 * @param threads_count the number of threads in the thread pool.
79 * A value of 0 has special interpretation: it creates a thread pool with as
80 * many threads as there are logical processors in the system.
81 *
82 * @returns A pointer to an opaque thread pool object if the call is
83 * successful, or NULL pointer if the call failed.
84 */
85 pthreadpool_t pthreadpool_create(size_t threads_count);
86
87 /**
88 * Query the number of threads in a thread pool.
89 *
90 * @param threadpool the thread pool to query.
91 *
92 * @returns The number of threads in the thread pool.
93 */
94 size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
95
96 /**
97 * Process items on a 1D grid.
98 *
99 * The function implements a parallel version of the following snippet:
100 *
101 * for (size_t i = 0; i < range; i++)
102 * function(context, i);
103 *
104 * When the function returns, all items have been processed and the thread pool
105 * is ready for a new task.
106 *
107 * @note If multiple threads call this function with the same thread pool, the
108 * calls are serialized.
109 *
110 * @param threadpool the thread pool to use for parallelisation. If threadpool
111 * is NULL, all items are processed serially on the calling thread.
112 * @param function the function to call for each item.
113 * @param context the first argument passed to the specified function.
114 * @param range the number of items on the 1D grid to process. The
115 * specified function will be called once for each item.
116 * @param flags a bitwise combination of zero or more optional flags
117 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
118 */
119 void pthreadpool_parallelize_1d(
120 pthreadpool_t threadpool,
121 pthreadpool_task_1d_t function,
122 void* context,
123 size_t range,
124 uint32_t flags);
125
126 /**
127 * Process items on a 1D grid passing along the current thread id.
128 *
129 * The function implements a parallel version of the following snippet:
130 *
131 * for (size_t i = 0; i < range; i++)
132 * function(context, thread_index, i);
133 *
134 * When the function returns, all items have been processed and the thread pool
135 * is ready for a new task.
136 *
137 * @note If multiple threads call this function with the same thread pool, the
138 * calls are serialized.
139 *
140 * @param threadpool the thread pool to use for parallelisation. If threadpool
141 * is NULL, all items are processed serially on the calling thread.
142 * @param function the function to call for each item.
143 * @param context the first argument passed to the specified function.
144 * @param range the number of items on the 1D grid to process. The
145 * specified function will be called once for each item.
146 * @param flags a bitwise combination of zero or more optional flags
147 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
148 */
149 void pthreadpool_parallelize_1d_with_thread(
150 pthreadpool_t threadpool,
151 pthreadpool_task_1d_with_thread_t function,
152 void* context,
153 size_t range,
154 uint32_t flags);
155
156 /**
157 * Process items on a 1D grid using a microarchitecture-aware task function.
158 *
159 * The function implements a parallel version of the following snippet:
160 *
161 * uint32_t uarch_index = cpuinfo_initialize() ?
162 * cpuinfo_get_current_uarch_index() : default_uarch_index;
163 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
164 * for (size_t i = 0; i < range; i++)
165 * function(context, uarch_index, i);
166 *
167 * When the function returns, all items have been processed and the thread pool
168 * is ready for a new task.
169 *
170 * @note If multiple threads call this function with the same thread pool, the
171 * calls are serialized.
172 *
173 * @param threadpool the thread pool to use for parallelisation. If
174 * threadpool is NULL, all items are processed serially on the calling
175 * thread.
176 * @param function the function to call for each item.
177 * @param context the first argument passed to the specified
178 * function.
179 * @param default_uarch_index the microarchitecture index to use when
180 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
181 * or index returned by cpuinfo_get_current_uarch_index() exceeds the
182 * max_uarch_index value.
183 * @param max_uarch_index the maximum microarchitecture index expected by
184 * the specified function. If the index returned by
185 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
186 * will be used instead. default_uarch_index can exceed max_uarch_index.
187 * @param range the number of items on the 1D grid to process.
188 * The specified function will be called once for each item.
189 * @param flags a bitwise combination of zero or more optional
190 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
191 * PTHREADPOOL_FLAG_YIELD_WORKERS)
192 */
193 void pthreadpool_parallelize_1d_with_uarch(
194 pthreadpool_t threadpool,
195 pthreadpool_task_1d_with_id_t function,
196 void* context,
197 uint32_t default_uarch_index,
198 uint32_t max_uarch_index,
199 size_t range,
200 uint32_t flags);
201
202 /**
203 * Process items on a 1D grid with specified maximum tile size.
204 *
205 * The function implements a parallel version of the following snippet:
206 *
207 * for (size_t i = 0; i < range; i += tile)
208 * function(context, i, min(range - i, tile));
209 *
210 * When the call returns, all items have been processed and the thread pool is
211 * ready for a new task.
212 *
213 * @note If multiple threads call this function with the same thread pool,
214 * the calls are serialized.
215 *
216 * @param threadpool the thread pool to use for parallelisation. If threadpool
217 * is NULL, all items are processed serially on the calling thread.
218 * @param function the function to call for each tile.
219 * @param context the first argument passed to the specified function.
220 * @param range the number of items on the 1D grid to process.
221 * @param tile the maximum number of items on the 1D grid to process in
222 * one function call.
223 * @param flags a bitwise combination of zero or more optional flags
224 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
225 */
226 void pthreadpool_parallelize_1d_tile_1d(
227 pthreadpool_t threadpool,
228 pthreadpool_task_1d_tile_1d_t function,
229 void* context,
230 size_t range,
231 size_t tile,
232 uint32_t flags);
233
234 /**
235 * Process items on a 2D grid.
236 *
237 * The function implements a parallel version of the following snippet:
238 *
239 * for (size_t i = 0; i < range_i; i++)
240 * for (size_t j = 0; j < range_j; j++)
241 * function(context, i, j);
242 *
243 * When the function returns, all items have been processed and the thread pool
244 * is ready for a new task.
245 *
246 * @note If multiple threads call this function with the same thread pool, the
247 * calls are serialized.
248 *
249 * @param threadpool the thread pool to use for parallelisation. If threadpool
250 * is NULL, all items are processed serially on the calling thread.
251 * @param function the function to call for each item.
252 * @param context the first argument passed to the specified function.
253 * @param range_i the number of items to process along the first dimension
254 * of the 2D grid.
255 * @param range_j the number of items to process along the second dimension
256 * of the 2D grid.
257 * @param flags a bitwise combination of zero or more optional flags
258 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
259 */
260 void pthreadpool_parallelize_2d(
261 pthreadpool_t threadpool,
262 pthreadpool_task_2d_t function,
263 void* context,
264 size_t range_i,
265 size_t range_j,
266 uint32_t flags);
267
268 /**
269 * Process items on a 2D grid passing along the current thread id.
270 *
271 * The function implements a parallel version of the following snippet:
272 *
273 * for (size_t i = 0; i < range_i; i++)
274 * for (size_t j = 0; j < range_j; j++)
275 * function(context, thread_index, i, j);
276 *
277 * When the function returns, all items have been processed and the thread pool
278 * is ready for a new task.
279 *
280 * @note If multiple threads call this function with the same thread pool, the
281 * calls are serialized.
282 *
283 * @param threadpool the thread pool to use for parallelisation. If threadpool
284 * is NULL, all items are processed serially on the calling thread.
285 * @param function the function to call for each item.
286 * @param context the first argument passed to the specified function.
287 * @param range_i the number of items to process along the first dimension
288 * of the 2D grid.
289 * @param range_j the number of items to process along the second dimension
290 * of the 2D grid.
291 * @param flags a bitwise combination of zero or more optional flags
292 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
293 */
294 void pthreadpool_parallelize_2d_with_thread(
295 pthreadpool_t threadpool,
296 pthreadpool_task_2d_with_thread_t function,
297 void* context,
298 size_t range_i,
299 size_t range_j,
300 uint32_t flags);
301
302 /**
303 * Process items on a 2D grid with the specified maximum tile size along the
304 * last grid dimension.
305 *
306 * The function implements a parallel version of the following snippet:
307 *
308 * for (size_t i = 0; i < range_i; i++)
309 * for (size_t j = 0; j < range_j; j += tile_j)
310 * function(context, i, j, min(range_j - j, tile_j));
311 *
312 * When the function returns, all items have been processed and the thread pool
313 * is ready for a new task.
314 *
315 * @note If multiple threads call this function with the same thread pool, the
316 * calls are serialized.
317 *
318 * @param threadpool the thread pool to use for parallelisation. If threadpool
319 * is NULL, all items are processed serially on the calling thread.
320 * @param function the function to call for each tile.
321 * @param context the first argument passed to the specified function.
322 * @param range_i the number of items to process along the first dimension
323 * of the 2D grid.
324 * @param range_j the number of items to process along the second dimension
325 * of the 2D grid.
326 * @param tile_j the maximum number of items along the second dimension of
327 * the 2D grid to process in one function call.
328 * @param flags a bitwise combination of zero or more optional flags
329 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
330 */
331 void pthreadpool_parallelize_2d_tile_1d(
332 pthreadpool_t threadpool,
333 pthreadpool_task_2d_tile_1d_t function,
334 void* context,
335 size_t range_i,
336 size_t range_j,
337 size_t tile_j,
338 uint32_t flags);
339
340 /**
341 * Process items on a 2D grid with the specified maximum tile size along the
342 * last grid dimension using a microarchitecture-aware task function.
343 *
344 * The function implements a parallel version of the following snippet:
345 *
346 * uint32_t uarch_index = cpuinfo_initialize() ?
347 * cpuinfo_get_current_uarch_index() : default_uarch_index;
348 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
349 * for (size_t i = 0; i < range_i; i++)
350 * for (size_t j = 0; j < range_j; j += tile_j)
351 * function(context, uarch_index, i, j, min(range_j - j, tile_j));
352 *
353 * When the function returns, all items have been processed and the thread pool
354 * is ready for a new task.
355 *
356 * @note If multiple threads call this function with the same thread pool, the
357 * calls are serialized.
358 *
359 * @param threadpool the thread pool to use for parallelisation. If threadpool
360 * is NULL, all items are processed serially on the calling thread.
361 * @param function the function to call for each tile.
362 * @param context the first argument passed to the specified function.
363 * @param default_uarch_index the microarchitecture index to use when
364 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
365 * or index returned by cpuinfo_get_current_uarch_index() exceeds the
366 * max_uarch_index value.
367 * @param max_uarch_index the maximum microarchitecture index expected by
368 * the specified function. If the index returned by
369 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
370 * will be used instead. default_uarch_index can exceed max_uarch_index.
371 * @param range_i the number of items to process along the first dimension
372 * of the 2D grid.
373 * @param range_j the number of items to process along the second dimension
374 * of the 2D grid.
375 * @param tile_j the maximum number of items along the second dimension of
376 * the 2D grid to process in one function call.
377 * @param flags a bitwise combination of zero or more optional flags
378 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
379 */
380 void pthreadpool_parallelize_2d_tile_1d_with_uarch(
381 pthreadpool_t threadpool,
382 pthreadpool_task_2d_tile_1d_with_id_t function,
383 void* context,
384 uint32_t default_uarch_index,
385 uint32_t max_uarch_index,
386 size_t range_i,
387 size_t range_j,
388 size_t tile_j,
389 uint32_t flags);
390
391 /**
392 * Process items on a 2D grid with the specified maximum tile size along the
393 * last grid dimension using a microarchitecture-aware task function and passing
394 * along the current thread id.
395 *
396 * The function implements a parallel version of the following snippet:
397 *
398 * uint32_t uarch_index = cpuinfo_initialize() ?
399 * cpuinfo_get_current_uarch_index() : default_uarch_index;
400 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
401 * for (size_t i = 0; i < range_i; i++)
402 * for (size_t j = 0; j < range_j; j += tile_j)
403 * function(context, uarch_index, thread_index, i, j, min(range_j - j, tile_j));
404 *
405 * When the function returns, all items have been processed and the thread pool
406 * is ready for a new task.
407 *
408 * @note If multiple threads call this function with the same thread pool, the
409 * calls are serialized.
410 *
411 * @param threadpool the thread pool to use for parallelisation. If threadpool
412 * is NULL, all items are processed serially on the calling thread.
413 * @param function the function to call for each tile.
414 * @param context the first argument passed to the specified function.
415 * @param default_uarch_index the microarchitecture index to use when
416 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
417 * or index returned by cpuinfo_get_current_uarch_index() exceeds the
418 * max_uarch_index value.
419 * @param max_uarch_index the maximum microarchitecture index expected by
420 * the specified function. If the index returned by
421 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
422 * will be used instead. default_uarch_index can exceed max_uarch_index.
423 * @param range_i the number of items to process along the first dimension
424 * of the 2D grid.
425 * @param range_j the number of items to process along the second dimension
426 * of the 2D grid.
427 * @param tile_j the maximum number of items along the second dimension of
428 * the 2D grid to process in one function call.
429 * @param flags a bitwise combination of zero or more optional flags
430 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
431 */
432 void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread(
433 pthreadpool_t threadpool,
434 pthreadpool_task_2d_tile_1d_with_id_with_thread_t function,
435 void* context,
436 uint32_t default_uarch_index,
437 uint32_t max_uarch_index,
438 size_t range_i,
439 size_t range_j,
440 size_t tile_j,
441 uint32_t flags);
442
443 /**
444 * Process items on a 2D grid with the specified maximum tile size along each
445 * grid dimension.
446 *
447 * The function implements a parallel version of the following snippet:
448 *
449 * for (size_t i = 0; i < range_i; i += tile_i)
450 * for (size_t j = 0; j < range_j; j += tile_j)
451 * function(context, i, j,
452 * min(range_i - i, tile_i), min(range_j - j, tile_j));
453 *
454 * When the function returns, all items have been processed and the thread pool
455 * is ready for a new task.
456 *
457 * @note If multiple threads call this function with the same thread pool, the
458 * calls are serialized.
459 *
460 * @param threadpool the thread pool to use for parallelisation. If threadpool
461 * is NULL, all items are processed serially on the calling thread.
462 * @param function the function to call for each tile.
463 * @param context the first argument passed to the specified function.
464 * @param range_i the number of items to process along the first dimension
465 * of the 2D grid.
466 * @param range_j the number of items to process along the second dimension
467 * of the 2D grid.
468 * @param tile_i the maximum number of items along the first dimension of
469 * the 2D grid to process in one function call.
470 * @param tile_j the maximum number of items along the second dimension of
471 * the 2D grid to process in one function call.
472 * @param flags a bitwise combination of zero or more optional flags
473 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
474 */
475 void pthreadpool_parallelize_2d_tile_2d(
476 pthreadpool_t threadpool,
477 pthreadpool_task_2d_tile_2d_t function,
478 void* context,
479 size_t range_i,
480 size_t range_j,
481 size_t tile_i,
482 size_t tile_j,
483 uint32_t flags);
484
485 /**
486 * Process items on a 2D grid with the specified maximum tile size along each
487 * grid dimension using a microarchitecture-aware task function.
488 *
489 * The function implements a parallel version of the following snippet:
490 *
491 * uint32_t uarch_index = cpuinfo_initialize() ?
492 * cpuinfo_get_current_uarch_index() : default_uarch_index;
493 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
494 * for (size_t i = 0; i < range_i; i += tile_i)
495 * for (size_t j = 0; j < range_j; j += tile_j)
496 * function(context, uarch_index, i, j,
497 * min(range_i - i, tile_i), min(range_j - j, tile_j));
498 *
499 * When the function returns, all items have been processed and the thread pool
500 * is ready for a new task.
501 *
502 * @note If multiple threads call this function with the same thread pool, the
503 * calls are serialized.
504 *
505 * @param threadpool the thread pool to use for parallelisation. If
506 * threadpool is NULL, all items are processed serially on the calling
507 * thread.
508 * @param function the function to call for each tile.
509 * @param context the first argument passed to the specified
510 * function.
511 * @param default_uarch_index the microarchitecture index to use when
512 * pthreadpool is configured without cpuinfo,
513 * cpuinfo initialization failed, or index returned
514 * by cpuinfo_get_current_uarch_index() exceeds
515 * the max_uarch_index value.
516 * @param max_uarch_index the maximum microarchitecture index expected
517 * by the specified function. If the index returned
518 * by cpuinfo_get_current_uarch_index() exceeds this
519 * value, default_uarch_index will be used instead.
520 * default_uarch_index can exceed max_uarch_index.
521 * @param range_i the number of items to process along the first
522 * dimension of the 2D grid.
523 * @param range_j the number of items to process along the second
524 * dimension of the 2D grid.
525 * @param tile_i the maximum number of items along the first
526 * dimension of the 2D grid to process in one function call.
527 * @param tile_j the maximum number of items along the second
528 * dimension of the 2D grid to process in one function call.
529 * @param flags a bitwise combination of zero or more optional
530 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
531 * PTHREADPOOL_FLAG_YIELD_WORKERS)
532 */
533 void pthreadpool_parallelize_2d_tile_2d_with_uarch(
534 pthreadpool_t threadpool,
535 pthreadpool_task_2d_tile_2d_with_id_t function,
536 void* context,
537 uint32_t default_uarch_index,
538 uint32_t max_uarch_index,
539 size_t range_i,
540 size_t range_j,
541 size_t tile_i,
542 size_t tile_j,
543 uint32_t flags);
544
545 /**
546 * Process items on a 3D grid.
547 *
548 * The function implements a parallel version of the following snippet:
549 *
550 * for (size_t i = 0; i < range_i; i++)
551 * for (size_t j = 0; j < range_j; j++)
552 * for (size_t k = 0; k < range_k; k++)
553 * function(context, i, j, k);
554 *
555 * When the function returns, all items have been processed and the thread pool
556 * is ready for a new task.
557 *
558 * @note If multiple threads call this function with the same thread pool, the
559 * calls are serialized.
560 *
561 * @param threadpool the thread pool to use for parallelisation. If threadpool
562 * is NULL, all items are processed serially on the calling thread.
563 * @param function the function to call for each item.
564 * @param context the first argument passed to the specified function.
565 * @param range_i the number of items to process along the first dimension
566 * of the 3D grid.
567 * @param range_j the number of items to process along the second dimension
568 * of the 3D grid.
569 * @param range_k the number of items to process along the third dimension
570 * of the 3D grid.
571 * @param flags a bitwise combination of zero or more optional flags
572 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
573 */
574 void pthreadpool_parallelize_3d(
575 pthreadpool_t threadpool,
576 pthreadpool_task_3d_t function,
577 void* context,
578 size_t range_i,
579 size_t range_j,
580 size_t range_k,
581 uint32_t flags);
582
583 /**
584 * Process items on a 3D grid with the specified maximum tile size along the
585 * last grid dimension.
586 *
587 * The function implements a parallel version of the following snippet:
588 *
589 * for (size_t i = 0; i < range_i; i++)
590 * for (size_t j = 0; j < range_j; j++)
591 * for (size_t k = 0; k < range_k; k += tile_k)
592 * function(context, i, j, k, min(range_k - k, tile_k));
593 *
594 * When the function returns, all items have been processed and the thread pool
595 * is ready for a new task.
596 *
597 * @note If multiple threads call this function with the same thread pool, the
598 * calls are serialized.
599 *
600 * @param threadpool the thread pool to use for parallelisation. If threadpool
601 * is NULL, all items are processed serially on the calling thread.
602 * @param function the function to call for each tile.
603 * @param context the first argument passed to the specified function.
604 * @param range_i the number of items to process along the first dimension
605 * of the 3D grid.
606 * @param range_j the number of items to process along the second dimension
607 * of the 3D grid.
608 * @param range_k the number of items to process along the third dimension
609 * of the 3D grid.
610 * @param tile_k the maximum number of items along the third dimension of
611 * the 3D grid to process in one function call.
612 * @param flags a bitwise combination of zero or more optional flags
613 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
614 */
615 void pthreadpool_parallelize_3d_tile_1d(
616 pthreadpool_t threadpool,
617 pthreadpool_task_3d_tile_1d_t function,
618 void* context,
619 size_t range_i,
620 size_t range_j,
621 size_t range_k,
622 size_t tile_k,
623 uint32_t flags);
624
625 /**
626 * Process items on a 3D grid with the specified maximum tile size along the
627 * last grid dimension and passing along the current thread id.
628 *
629 * The function implements a parallel version of the following snippet:
630 *
631 * for (size_t i = 0; i < range_i; i++)
632 * for (size_t j = 0; j < range_j; j++)
633 * for (size_t k = 0; k < range_k; k += tile_k)
634 * function(context, thread_index, i, j, k, min(range_k - k, tile_k));
635 *
636 * When the function returns, all items have been processed and the thread pool
637 * is ready for a new task.
638 *
639 * @note If multiple threads call this function with the same thread pool, the
640 * calls are serialized.
641 *
642 * @param threadpool the thread pool to use for parallelisation. If threadpool
643 * is NULL, all items are processed serially on the calling thread.
644 * @param function the function to call for each tile.
645 * @param context the first argument passed to the specified function.
646 * @param range_i the number of items to process along the first dimension
647 * of the 3D grid.
648 * @param range_j the number of items to process along the second dimension
649 * of the 3D grid.
650 * @param range_k the number of items to process along the third dimension
651 * of the 3D grid.
652 * @param tile_k the maximum number of items along the third dimension of
653 * the 3D grid to process in one function call.
654 * @param flags a bitwise combination of zero or more optional flags
655 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
656 */
657 void pthreadpool_parallelize_3d_tile_1d_with_thread(
658 pthreadpool_t threadpool,
659 pthreadpool_task_3d_tile_1d_with_thread_t function,
660 void* context,
661 size_t range_i,
662 size_t range_j,
663 size_t range_k,
664 size_t tile_k,
665 uint32_t flags);
666
667 /**
668 * Process items on a 3D grid with the specified maximum tile size along the
669 * last grid dimension using a microarchitecture-aware task function.
670 *
671 * The function implements a parallel version of the following snippet:
672 *
673 * uint32_t uarch_index = cpuinfo_initialize() ?
674 * cpuinfo_get_current_uarch_index() : default_uarch_index;
675 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
676 * for (size_t i = 0; i < range_i; i++)
677 * for (size_t j = 0; j < range_j; j++)
678 * for (size_t k = 0; k < range_k; k += tile_k)
679 * function(context, uarch_index, i, j, k, min(range_k - k, tile_k));
680 *
681 * When the function returns, all items have been processed and the thread pool
682 * is ready for a new task.
683 *
684 * @note If multiple threads call this function with the same thread pool, the
685 * calls are serialized.
686 *
687 * @param threadpool the thread pool to use for parallelisation. If
688 * threadpool is NULL, all items are processed serially on the calling
689 * thread.
690 * @param function the function to call for each tile.
691 * @param context the first argument passed to the specified
692 * function.
693 * @param default_uarch_index the microarchitecture index to use when
694 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
695 * or index returned by cpuinfo_get_current_uarch_index() exceeds the
696 * max_uarch_index value.
697 * @param max_uarch_index the maximum microarchitecture index expected by
698 * the specified function. If the index returned by
699 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
700 * will be used instead. default_uarch_index can exceed max_uarch_index.
701 * @param range_i the number of items to process along the first
702 * dimension of the 3D grid.
703 * @param range_j the number of items to process along the second
704 * dimension of the 3D grid.
705 * @param range_k the number of items to process along the third
706 * dimension of the 3D grid.
707 * @param tile_k the maximum number of items along the third
708 * dimension of the 3D grid to process in one function call.
709 * @param flags a bitwise combination of zero or more optional
710 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
711 * PTHREADPOOL_FLAG_YIELD_WORKERS)
712 */
713 void pthreadpool_parallelize_3d_tile_1d_with_uarch(
714 pthreadpool_t threadpool,
715 pthreadpool_task_3d_tile_1d_with_id_t function,
716 void* context,
717 uint32_t default_uarch_index,
718 uint32_t max_uarch_index,
719 size_t range_i,
720 size_t range_j,
721 size_t range_k,
722 size_t tile_k,
723 uint32_t flags);
724
725 /**
726 * Process items on a 3D grid with the specified maximum tile size along the
727 * last grid dimension using a microarchitecture-aware task function and passing
728 * along the current thread id.
729 *
730 * The function implements a parallel version of the following snippet:
731 *
732 * uint32_t uarch_index = cpuinfo_initialize() ?
733 * cpuinfo_get_current_uarch_index() : default_uarch_index;
734 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
735 * for (size_t i = 0; i < range_i; i++)
736 * for (size_t j = 0; j < range_j; j++)
737 * for (size_t k = 0; k < range_k; k += tile_k)
738 * function(context, uarch_index, thread_index, i, j, k, min(range_k - k, tile_k));
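 *
 * Here thread_index is the id of the current thread, i.e. the thread that
 * executes that particular call of the function.
 *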
740 * When the function returns, all items have been processed and the thread pool
741 * is ready for a new task.
742 *
743 * @note If multiple threads call this function with the same thread pool, the
744 * calls are serialized.
745 *
746 * @param threadpool the thread pool to use for parallelisation. If
747 * threadpool is NULL, all items are processed serially on the calling
748 * thread.
749 * @param function the function to call for each tile.
750 * @param context the first argument passed to the specified
751 * function.
752 * @param default_uarch_index the microarchitecture index to use when
753 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
754 * or index returned by cpuinfo_get_current_uarch_index() exceeds the
755 * max_uarch_index value.
756 * @param max_uarch_index the maximum microarchitecture index expected by
757 * the specified function. If the index returned by
758 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
759 * will be used instead. default_uarch_index can exceed max_uarch_index.
760 * @param range_i the number of items to process along the first
761 * dimension of the 3D grid.
762 * @param range_j the number of items to process along the second
763 * dimension of the 3D grid.
764 * @param range_k the number of items to process along the third
765 * dimension of the 3D grid.
766 * @param tile_k the maximum number of items along the third
767 * dimension of the 3D grid to process in one function call.
768 * @param flags a bitwise combination of zero or more optional
769 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
770 * PTHREADPOOL_FLAG_YIELD_WORKERS)
771 */
772 void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread(
773 pthreadpool_t threadpool,
774 pthreadpool_task_3d_tile_1d_with_id_with_thread_t function,
775 void* context,
776 uint32_t default_uarch_index,
777 uint32_t max_uarch_index,
778 size_t range_i,
779 size_t range_j,
780 size_t range_k,
781 size_t tile_k,
782 uint32_t flags);
783
784 /**
785 * Process items on a 3D grid with the specified maximum tile size along the
786 * last two grid dimensions.
787 *
788 * The function implements a parallel version of the following snippet:
789 *
790 * for (size_t i = 0; i < range_i; i++)
791 * for (size_t j = 0; j < range_j; j += tile_j)
792 * for (size_t k = 0; k < range_k; k += tile_k)
793 * function(context, i, j, k,
794 * min(range_j - j, tile_j), min(range_k - k, tile_k));
795 *
796 * When the function returns, all items have been processed and the thread pool
797 * is ready for a new task.
798 *
799 * @note If multiple threads call this function with the same thread pool, the
800 * calls are serialized.
801 *
802 * @param threadpool the thread pool to use for parallelisation. If threadpool
803 * is NULL, all items are processed serially on the calling thread.
804 * @param function the function to call for each tile.
805 * @param context the first argument passed to the specified function.
806 * @param range_i the number of items to process along the first dimension
807 * of the 3D grid.
808 * @param range_j the number of items to process along the second dimension
809 * of the 3D grid.
810 * @param range_k the number of items to process along the third dimension
811 * of the 3D grid.
812 * @param tile_j the maximum number of items along the second dimension of
813 * the 3D grid to process in one function call.
814 * @param tile_k the maximum number of items along the third dimension of
815 * the 3D grid to process in one function call.
816 * @param flags a bitwise combination of zero or more optional flags
817 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
818 */
819 void pthreadpool_parallelize_3d_tile_2d(
820 pthreadpool_t threadpool,
821 pthreadpool_task_3d_tile_2d_t function,
822 void* context,
823 size_t range_i,
824 size_t range_j,
825 size_t range_k,
826 size_t tile_j,
827 size_t tile_k,
828 uint32_t flags);
829
830 /**
831 * Process items on a 3D grid with the specified maximum tile size along the
832 * last two grid dimensions using a microarchitecture-aware task function.
833 *
834 * The function implements a parallel version of the following snippet:
835 *
836 * uint32_t uarch_index = cpuinfo_initialize() ?
837 * cpuinfo_get_current_uarch_index() : default_uarch_index;
838 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
839 * for (size_t i = 0; i < range_i; i++)
840 * for (size_t j = 0; j < range_j; j += tile_j)
841 * for (size_t k = 0; k < range_k; k += tile_k)
842 * function(context, uarch_index, i, j, k,
843 * min(range_j - j, tile_j), min(range_k - k, tile_k));
844 *
845 * When the function returns, all items have been processed and the thread pool
846 * is ready for a new task.
847 *
848 * @note If multiple threads call this function with the same thread pool, the
849 * calls are serialized.
850 *
851 * @param threadpool the thread pool to use for parallelisation. If
852 * threadpool is NULL, all items are processed serially on the calling
853 * thread.
854 * @param function the function to call for each tile.
855 * @param context the first argument passed to the specified
856 * function.
857 * @param default_uarch_index the microarchitecture index to use when
858 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
859 * or index returned by cpuinfo_get_current_uarch_index() exceeds the
860 * max_uarch_index value.
861 * @param max_uarch_index the maximum microarchitecture index expected by
862 * the specified function. If the index returned by
863 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
864 * will be used instead. default_uarch_index can exceed max_uarch_index.
865 * @param range_i the number of items to process along the first
866 * dimension of the 3D grid.
867 * @param range_j the number of items to process along the second
868 * dimension of the 3D grid.
869 * @param range_k the number of items to process along the third
870 * dimension of the 3D grid.
871 * @param tile_j the maximum number of items along the second
872 * dimension of the 3D grid to process in one function call.
873 * @param tile_k the maximum number of items along the third
874 * dimension of the 3D grid to process in one function call.
875 * @param flags a bitwise combination of zero or more optional
876 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
877 * PTHREADPOOL_FLAG_YIELD_WORKERS)
878 */
879 void pthreadpool_parallelize_3d_tile_2d_with_uarch(
880 pthreadpool_t threadpool,
881 pthreadpool_task_3d_tile_2d_with_id_t function,
882 void* context,
883 uint32_t default_uarch_index,
884 uint32_t max_uarch_index,
885 size_t range_i,
886 size_t range_j,
887 size_t range_k,
888 size_t tile_j,
889 size_t tile_k,
890 uint32_t flags);
891
892 /**
893 * Process items on a 4D grid.
894 *
895 * The function implements a parallel version of the following snippet:
896 *
897 * for (size_t i = 0; i < range_i; i++)
898 * for (size_t j = 0; j < range_j; j++)
899 * for (size_t k = 0; k < range_k; k++)
900 * for (size_t l = 0; l < range_l; l++)
901 * function(context, i, j, k, l);
902 *
903 * When the function returns, all items have been processed and the thread pool
904 * is ready for a new task.
905 *
906 * @note If multiple threads call this function with the same thread pool, the
907 * calls are serialized.
908 *
909 * @param threadpool the thread pool to use for parallelisation. If threadpool
910 * is NULL, all items are processed serially on the calling thread.
 * @param function the function to call for each item.
912 * @param context the first argument passed to the specified function.
913 * @param range_i the number of items to process along the first dimension
914 * of the 4D grid.
915 * @param range_j the number of items to process along the second dimension
916 * of the 4D grid.
917 * @param range_k the number of items to process along the third dimension
918 * of the 4D grid.
919 * @param range_l the number of items to process along the fourth dimension
920 * of the 4D grid.
921 * @param flags a bitwise combination of zero or more optional flags
922 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
923 */
924 void pthreadpool_parallelize_4d(
925 pthreadpool_t threadpool,
926 pthreadpool_task_4d_t function,
927 void* context,
928 size_t range_i,
929 size_t range_j,
930 size_t range_k,
931 size_t range_l,
932 uint32_t flags);
933
934 /**
935 * Process items on a 4D grid with the specified maximum tile size along the
936 * last grid dimension.
937 *
938 * The function implements a parallel version of the following snippet:
939 *
940 * for (size_t i = 0; i < range_i; i++)
941 * for (size_t j = 0; j < range_j; j++)
942 * for (size_t k = 0; k < range_k; k++)
943 * for (size_t l = 0; l < range_l; l += tile_l)
944 * function(context, i, j, k, l, min(range_l - l, tile_l));
945 *
946 * When the function returns, all items have been processed and the thread pool
947 * is ready for a new task.
948 *
949 * @note If multiple threads call this function with the same thread pool, the
950 * calls are serialized.
951 *
952 * @param threadpool the thread pool to use for parallelisation. If threadpool
953 * is NULL, all items are processed serially on the calling thread.
954 * @param function the function to call for each tile.
955 * @param context the first argument passed to the specified function.
956 * @param range_i the number of items to process along the first dimension
957 * of the 4D grid.
958 * @param range_j the number of items to process along the second dimension
959 * of the 4D grid.
960 * @param range_k the number of items to process along the third dimension
961 * of the 4D grid.
962 * @param range_l the number of items to process along the fourth dimension
963 * of the 4D grid.
964 * @param tile_l the maximum number of items along the fourth dimension of
965 * the 4D grid to process in one function call.
966 * @param flags a bitwise combination of zero or more optional flags
967 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
968 */
969 void pthreadpool_parallelize_4d_tile_1d(
970 pthreadpool_t threadpool,
971 pthreadpool_task_4d_tile_1d_t function,
972 void* context,
973 size_t range_i,
974 size_t range_j,
975 size_t range_k,
976 size_t range_l,
977 size_t tile_l,
978 uint32_t flags);
979
980 /**
981 * Process items on a 4D grid with the specified maximum tile size along the
982 * last two grid dimensions.
983 *
984 * The function implements a parallel version of the following snippet:
985 *
986 * for (size_t i = 0; i < range_i; i++)
987 * for (size_t j = 0; j < range_j; j++)
988 * for (size_t k = 0; k < range_k; k += tile_k)
989 * for (size_t l = 0; l < range_l; l += tile_l)
990 * function(context, i, j, k, l,
991 * min(range_k - k, tile_k), min(range_l - l, tile_l));
992 *
993 * When the function returns, all items have been processed and the thread pool
994 * is ready for a new task.
995 *
996 * @note If multiple threads call this function with the same thread pool, the
997 * calls are serialized.
998 *
999 * @param threadpool the thread pool to use for parallelisation. If threadpool
1000 * is NULL, all items are processed serially on the calling thread.
1001 * @param function the function to call for each tile.
1002 * @param context the first argument passed to the specified function.
1003 * @param range_i the number of items to process along the first dimension
1004 * of the 4D grid.
1005 * @param range_j the number of items to process along the second dimension
1006 * of the 4D grid.
1007 * @param range_k the number of items to process along the third dimension
1008 * of the 4D grid.
1009 * @param range_l the number of items to process along the fourth dimension
1010 * of the 4D grid.
1011 * @param tile_k the maximum number of items along the third dimension of
1012 * the 4D grid to process in one function call.
1013 * @param tile_l the maximum number of items along the fourth dimension of
1014 * the 4D grid to process in one function call.
1015 * @param flags a bitwise combination of zero or more optional flags
1016 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1017 */
1018 void pthreadpool_parallelize_4d_tile_2d(
1019 pthreadpool_t threadpool,
1020 pthreadpool_task_4d_tile_2d_t function,
1021 void* context,
1022 size_t range_i,
1023 size_t range_j,
1024 size_t range_k,
1025 size_t range_l,
1026 size_t tile_k,
1027 size_t tile_l,
1028 uint32_t flags);
1029
1030 /**
1031 * Process items on a 4D grid with the specified maximum tile size along the
1032 * last two grid dimensions using a microarchitecture-aware task function.
1033 *
1034 * The function implements a parallel version of the following snippet:
1035 *
1036 * uint32_t uarch_index = cpuinfo_initialize() ?
1037 * cpuinfo_get_current_uarch_index() : default_uarch_index;
1038 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
1039 * for (size_t i = 0; i < range_i; i++)
1040 * for (size_t j = 0; j < range_j; j++)
1041 * for (size_t k = 0; k < range_k; k += tile_k)
1042 * for (size_t l = 0; l < range_l; l += tile_l)
1043 * function(context, uarch_index, i, j, k, l,
1044 * min(range_k - k, tile_k), min(range_l - l, tile_l));
1045 *
1046 * When the function returns, all items have been processed and the thread pool
1047 * is ready for a new task.
1048 *
1049 * @note If multiple threads call this function with the same thread pool, the
1050 * calls are serialized.
1051 *
1052 * @param threadpool the thread pool to use for parallelisation. If
1053 * threadpool is NULL, all items are processed serially on the calling
1054 * thread.
1055 * @param function the function to call for each tile.
1056 * @param context the first argument passed to the specified
1057 * function.
1058 * @param default_uarch_index the microarchitecture index to use when
1059 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
1060 * or index returned by cpuinfo_get_current_uarch_index() exceeds the
1061 * max_uarch_index value.
1062 * @param max_uarch_index the maximum microarchitecture index expected by
1063 * the specified function. If the index returned by
1064 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
1065 * will be used instead. default_uarch_index can exceed max_uarch_index.
1066 * @param range_i the number of items to process along the first
1067 * dimension of the 4D grid.
1068 * @param range_j the number of items to process along the second
1069 * dimension of the 4D grid.
1070 * @param range_k the number of items to process along the third
1071 * dimension of the 4D grid.
1072 * @param range_l the number of items to process along the fourth
1073 * dimension of the 4D grid.
1074 * @param tile_k the maximum number of items along the third
1075 * dimension of the 4D grid to process in one function call.
1076 * @param tile_l the maximum number of items along the fourth
1077 * dimension of the 4D grid to process in one function call.
1078 * @param flags a bitwise combination of zero or more optional
1079 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
1080 * PTHREADPOOL_FLAG_YIELD_WORKERS)
1081 */
1082 void pthreadpool_parallelize_4d_tile_2d_with_uarch(
1083 pthreadpool_t threadpool,
1084 pthreadpool_task_4d_tile_2d_with_id_t function,
1085 void* context,
1086 uint32_t default_uarch_index,
1087 uint32_t max_uarch_index,
1088 size_t range_i,
1089 size_t range_j,
1090 size_t range_k,
1091 size_t range_l,
1092 size_t tile_k,
1093 size_t tile_l,
1094 uint32_t flags);
1095
1096 /**
1097 * Process items on a 5D grid.
1098 *
1099 * The function implements a parallel version of the following snippet:
1100 *
1101 * for (size_t i = 0; i < range_i; i++)
1102 * for (size_t j = 0; j < range_j; j++)
1103 * for (size_t k = 0; k < range_k; k++)
1104 * for (size_t l = 0; l < range_l; l++)
1105 * for (size_t m = 0; m < range_m; m++)
1106 * function(context, i, j, k, l, m);
1107 *
1108 * When the function returns, all items have been processed and the thread pool
1109 * is ready for a new task.
1110 *
1111 * @note If multiple threads call this function with the same thread pool, the
1112 * calls are serialized.
1113 *
1114 * @param threadpool the thread pool to use for parallelisation. If threadpool
1115 * is NULL, all items are processed serially on the calling thread.
 * @param function the function to call for each item.
1117 * @param context the first argument passed to the specified function.
1118 * @param range_i the number of items to process along the first dimension
1119 * of the 5D grid.
1120 * @param range_j the number of items to process along the second dimension
1121 * of the 5D grid.
1122 * @param range_k the number of items to process along the third dimension
1123 * of the 5D grid.
1124 * @param range_l the number of items to process along the fourth dimension
1125 * of the 5D grid.
1126 * @param range_m the number of items to process along the fifth dimension
1127 * of the 5D grid.
1128 * @param flags a bitwise combination of zero or more optional flags
1129 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1130 */
1131 void pthreadpool_parallelize_5d(
1132 pthreadpool_t threadpool,
1133 pthreadpool_task_5d_t function,
1134 void* context,
1135 size_t range_i,
1136 size_t range_j,
1137 size_t range_k,
1138 size_t range_l,
1139 size_t range_m,
1140 uint32_t flags);
1141
1142 /**
1143 * Process items on a 5D grid with the specified maximum tile size along the
1144 * last grid dimension.
1145 *
1146 * The function implements a parallel version of the following snippet:
1147 *
1148 * for (size_t i = 0; i < range_i; i++)
1149 * for (size_t j = 0; j < range_j; j++)
1150 * for (size_t k = 0; k < range_k; k++)
1151 * for (size_t l = 0; l < range_l; l++)
1152 * for (size_t m = 0; m < range_m; m += tile_m)
1153 * function(context, i, j, k, l, m, min(range_m - m, tile_m));
1154 *
1155 * When the function returns, all items have been processed and the thread pool
1156 * is ready for a new task.
1157 *
1158 * @note If multiple threads call this function with the same thread pool, the
1159 * calls are serialized.
1160 *
1161 * @param threadpool the thread pool to use for parallelisation. If threadpool
1162 * is NULL, all items are processed serially on the calling thread.
1163 * @param function the function to call for each tile.
1164 * @param context the first argument passed to the specified function.
1165 * @param range_i the number of items to process along the first dimension
1166 * of the 5D grid.
1167 * @param range_j the number of items to process along the second dimension
1168 * of the 5D grid.
1169 * @param range_k the number of items to process along the third dimension
1170 * of the 5D grid.
1171 * @param range_l the number of items to process along the fourth dimension
1172 * of the 5D grid.
1173 * @param range_m the number of items to process along the fifth dimension
1174 * of the 5D grid.
1175 * @param tile_m the maximum number of items along the fifth dimension of
1176 * the 5D grid to process in one function call.
1177 * @param flags a bitwise combination of zero or more optional flags
1178 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1179 */
1180 void pthreadpool_parallelize_5d_tile_1d(
1181 pthreadpool_t threadpool,
1182 pthreadpool_task_5d_tile_1d_t function,
1183 void* context,
1184 size_t range_i,
1185 size_t range_j,
1186 size_t range_k,
1187 size_t range_l,
1188 size_t range_m,
1189 size_t tile_m,
1190 uint32_t flags);
1191
1192 /**
1193 * Process items on a 5D grid with the specified maximum tile size along the
1194 * last two grid dimensions.
1195 *
1196 * The function implements a parallel version of the following snippet:
1197 *
1198 * for (size_t i = 0; i < range_i; i++)
1199 * for (size_t j = 0; j < range_j; j++)
1200 * for (size_t k = 0; k < range_k; k++)
1201 * for (size_t l = 0; l < range_l; l += tile_l)
1202 * for (size_t m = 0; m < range_m; m += tile_m)
1203 * function(context, i, j, k, l, m,
1204 * min(range_l - l, tile_l), min(range_m - m, tile_m));
1205 *
1206 * When the function returns, all items have been processed and the thread pool
1207 * is ready for a new task.
1208 *
1209 * @note If multiple threads call this function with the same thread pool, the
1210 * calls are serialized.
1211 *
1212 * @param threadpool the thread pool to use for parallelisation. If threadpool
1213 * is NULL, all items are processed serially on the calling thread.
1214 * @param function the function to call for each tile.
1215 * @param context the first argument passed to the specified function.
1216 * @param range_i the number of items to process along the first dimension
1217 * of the 5D grid.
1218 * @param range_j the number of items to process along the second dimension
1219 * of the 5D grid.
1220 * @param range_k the number of items to process along the third dimension
1221 * of the 5D grid.
1222 * @param range_l the number of items to process along the fourth dimension
1223 * of the 5D grid.
1224 * @param range_m the number of items to process along the fifth dimension
1225 * of the 5D grid.
1226 * @param tile_l the maximum number of items along the fourth dimension of
1227 * the 5D grid to process in one function call.
1228 * @param tile_m the maximum number of items along the fifth dimension of
1229 * the 5D grid to process in one function call.
1230 * @param flags a bitwise combination of zero or more optional flags
1231 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1232 */
1233 void pthreadpool_parallelize_5d_tile_2d(
1234 pthreadpool_t threadpool,
1235 pthreadpool_task_5d_tile_2d_t function,
1236 void* context,
1237 size_t range_i,
1238 size_t range_j,
1239 size_t range_k,
1240 size_t range_l,
1241 size_t range_m,
1242 size_t tile_l,
1243 size_t tile_m,
1244 uint32_t flags);
1245
1246 /**
1247 * Process items on a 6D grid.
1248 *
1249 * The function implements a parallel version of the following snippet:
1250 *
1251 * for (size_t i = 0; i < range_i; i++)
1252 * for (size_t j = 0; j < range_j; j++)
1253 * for (size_t k = 0; k < range_k; k++)
1254 * for (size_t l = 0; l < range_l; l++)
1255 * for (size_t m = 0; m < range_m; m++)
1256 * for (size_t n = 0; n < range_n; n++)
1257 * function(context, i, j, k, l, m, n);
1258 *
1259 * When the function returns, all items have been processed and the thread pool
1260 * is ready for a new task.
1261 *
1262 * @note If multiple threads call this function with the same thread pool, the
1263 * calls are serialized.
1264 *
1265 * @param threadpool the thread pool to use for parallelisation. If threadpool
1266 * is NULL, all items are processed serially on the calling thread.
 * @param function the function to call for each item.
1268 * @param context the first argument passed to the specified function.
1269 * @param range_i the number of items to process along the first dimension
1270 * of the 6D grid.
1271 * @param range_j the number of items to process along the second dimension
1272 * of the 6D grid.
1273 * @param range_k the number of items to process along the third dimension
1274 * of the 6D grid.
1275 * @param range_l the number of items to process along the fourth dimension
1276 * of the 6D grid.
1277 * @param range_m the number of items to process along the fifth dimension
1278 * of the 6D grid.
 * @param range_n the number of items to process along the sixth dimension
 *     of the 6D grid.
1283 * @param flags a bitwise combination of zero or more optional flags
1284 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1285 */
1286 void pthreadpool_parallelize_6d(
1287 pthreadpool_t threadpool,
1288 pthreadpool_task_6d_t function,
1289 void* context,
1290 size_t range_i,
1291 size_t range_j,
1292 size_t range_k,
1293 size_t range_l,
1294 size_t range_m,
1295 size_t range_n,
1296 uint32_t flags);
1297
1298 /**
1299 * Process items on a 6D grid with the specified maximum tile size along the
1300 * last grid dimension.
1301 *
1302 * The function implements a parallel version of the following snippet:
1303 *
1304 * for (size_t i = 0; i < range_i; i++)
1305 * for (size_t j = 0; j < range_j; j++)
1306 * for (size_t k = 0; k < range_k; k++)
1307 * for (size_t l = 0; l < range_l; l++)
1308 * for (size_t m = 0; m < range_m; m++)
1309 * for (size_t n = 0; n < range_n; n += tile_n)
1310 * function(context, i, j, k, l, m, n, min(range_n - n, tile_n));
1311 *
1312 * When the function returns, all items have been processed and the thread pool
1313 * is ready for a new task.
1314 *
1315 * @note If multiple threads call this function with the same thread pool, the
1316 * calls are serialized.
1317 *
1318 * @param threadpool the thread pool to use for parallelisation. If threadpool
1319 * is NULL, all items are processed serially on the calling thread.
1320 * @param function the function to call for each tile.
1321 * @param context the first argument passed to the specified function.
1322 * @param range_i the number of items to process along the first dimension
1323 * of the 6D grid.
1324 * @param range_j the number of items to process along the second dimension
1325 * of the 6D grid.
1326 * @param range_k the number of items to process along the third dimension
1327 * of the 6D grid.
1328 * @param range_l the number of items to process along the fourth dimension
1329 * of the 6D grid.
1330 * @param range_m the number of items to process along the fifth dimension
1331 * of the 6D grid.
1332 * @param range_n the number of items to process along the sixth dimension
1333 * of the 6D grid.
1334 * @param tile_n the maximum number of items along the sixth dimension of
1335 * the 6D grid to process in one function call.
1336 * @param flags a bitwise combination of zero or more optional flags
1337 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1338 */
1339 void pthreadpool_parallelize_6d_tile_1d(
1340 pthreadpool_t threadpool,
1341 pthreadpool_task_6d_tile_1d_t function,
1342 void* context,
1343 size_t range_i,
1344 size_t range_j,
1345 size_t range_k,
1346 size_t range_l,
1347 size_t range_m,
1348 size_t range_n,
1349 size_t tile_n,
1350 uint32_t flags);
1351
1352 /**
1353 * Process items on a 6D grid with the specified maximum tile size along the
1354 * last two grid dimensions.
1355 *
1356 * The function implements a parallel version of the following snippet:
1357 *
1358 * for (size_t i = 0; i < range_i; i++)
1359 * for (size_t j = 0; j < range_j; j++)
1360 * for (size_t k = 0; k < range_k; k++)
1361 * for (size_t l = 0; l < range_l; l++)
1362 * for (size_t m = 0; m < range_m; m += tile_m)
1363 * for (size_t n = 0; n < range_n; n += tile_n)
1364 * function(context, i, j, k, l, m, n,
1365 * min(range_m - m, tile_m), min(range_n - n, tile_n));
1366 *
1367 * When the function returns, all items have been processed and the thread pool
1368 * is ready for a new task.
1369 *
1370 * @note If multiple threads call this function with the same thread pool, the
1371 * calls are serialized.
1372 *
1373 * @param threadpool the thread pool to use for parallelisation. If threadpool
1374 * is NULL, all items are processed serially on the calling thread.
1375 * @param function the function to call for each tile.
1376 * @param context the first argument passed to the specified function.
1377 * @param range_i the number of items to process along the first dimension
1378 * of the 6D grid.
1379 * @param range_j the number of items to process along the second dimension
1380 * of the 6D grid.
1381 * @param range_k the number of items to process along the third dimension
1382 * of the 6D grid.
1383 * @param range_l the number of items to process along the fourth dimension
1384 * of the 6D grid.
1385 * @param range_m the number of items to process along the fifth dimension
1386 * of the 6D grid.
1387 * @param range_n the number of items to process along the sixth dimension
1388 * of the 6D grid.
1389 * @param tile_m the maximum number of items along the fifth dimension of
1390 * the 6D grid to process in one function call.
1391 * @param tile_n the maximum number of items along the sixth dimension of
1392 * the 6D grid to process in one function call.
1393 * @param flags a bitwise combination of zero or more optional flags
1394 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1395 */
1396 void pthreadpool_parallelize_6d_tile_2d(
1397 pthreadpool_t threadpool,
1398 pthreadpool_task_6d_tile_2d_t function,
1399 void* context,
1400 size_t range_i,
1401 size_t range_j,
1402 size_t range_k,
1403 size_t range_l,
1404 size_t range_m,
1405 size_t range_n,
1406 size_t tile_m,
1407 size_t tile_n,
1408 uint32_t flags);
1409
1410 /**
1411 * Terminates threads in the thread pool and releases associated resources.
1412 *
1413 * @warning Accessing the thread pool after a call to this function constitutes
1414 * undefined behaviour and may cause data corruption.
1415 *
1416 * @param[in,out] threadpool The thread pool to destroy.
1417 */
1418 void pthreadpool_destroy(pthreadpool_t threadpool);
1419
1420 #ifndef PTHREADPOOL_NO_DEPRECATED_API
1421
/*
 * Legacy API for compatibility with pre-existing users (e.g. NNPACK).
 *
 * PTHREADPOOL_DEPRECATED is appended to every legacy declaration in this
 * section. When __GNUC__ is defined it expands to the GNU `deprecated`
 * function attribute; for any other compiler it expands to nothing, so the
 * declarations carry no annotation there.
 */
#if defined(__GNUC__)
#define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__))
#else
#define PTHREADPOOL_DEPRECATED
#endif
1428
/*
 * Callback signatures accepted by the legacy pthreadpool_compute_* entry
 * points. Every callback takes an opaque void* first, followed only by size_t
 * values; the number of size_t arguments depends on the variant:
 *   1d -> 1, 1d_tiled -> 2, 2d -> 2, 2d_tiled -> 4, 3d_tiled -> 6,
 *   4d_tiled -> 8.
 * NOTE(review): this header does not state which size_t arguments are indices
 * and which are tile extents -- confirm against the implementation.
 */
typedef void (*pthreadpool_function_1d_t)(void*, size_t);
typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t);
typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
1435
/**
 * Legacy 1D entry point, kept for pre-existing users.
 *
 * Declared with PTHREADPOOL_DEPRECATED, so compilers that define __GNUC__
 * see the `deprecated` attribute on this function.
 *
 * @param threadpool  the thread pool handle.
 * @param function    callback of type pthreadpool_function_1d_t.
 * @param argument    opaque pointer; presumably passed to the callback as its
 *    first argument -- confirm against the implementation.
 * @param range       extent of the 1D range; presumably the number of items
 *    to process -- confirm against the implementation.
 */
void pthreadpool_compute_1d(
	pthreadpool_t threadpool,
	pthreadpool_function_1d_t function,
	void* argument,
	size_t range) PTHREADPOOL_DEPRECATED;
1441
/**
 * Legacy tiled 1D entry point, kept for pre-existing users.
 *
 * Declared with PTHREADPOOL_DEPRECATED, so compilers that define __GNUC__
 * see the `deprecated` attribute on this function.
 *
 * @param threadpool  the thread pool handle.
 * @param function    callback of type pthreadpool_function_1d_tiled_t.
 * @param argument    opaque pointer; presumably passed to the callback as its
 *    first argument -- confirm against the implementation.
 * @param range       extent of the 1D range.
 * @param tile        tile size along the range; presumably the maximum number
 *    of items handled per callback invocation -- confirm against the
 *    implementation.
 */
void pthreadpool_compute_1d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_1d_tiled_t function,
	void* argument,
	size_t range,
	size_t tile) PTHREADPOOL_DEPRECATED;
1448
/**
 * Legacy 2D entry point, kept for pre-existing users.
 *
 * Declared with PTHREADPOOL_DEPRECATED, so compilers that define __GNUC__
 * see the `deprecated` attribute on this function.
 *
 * @param threadpool  the thread pool handle.
 * @param function    callback of type pthreadpool_function_2d_t.
 * @param argument    opaque pointer; presumably passed to the callback as its
 *    first argument -- confirm against the implementation.
 * @param range_i     extent of the first dimension of the 2D range.
 * @param range_j     extent of the second dimension of the 2D range.
 */
void pthreadpool_compute_2d(
	pthreadpool_t threadpool,
	pthreadpool_function_2d_t function,
	void* argument,
	size_t range_i,
	size_t range_j) PTHREADPOOL_DEPRECATED;
1455
/**
 * Legacy tiled 2D entry point, kept for pre-existing users.
 *
 * Unlike the pthreadpool_parallelize_2d_tile_* declarations, this variant
 * takes a tile size for every dimension. Declared with
 * PTHREADPOOL_DEPRECATED, so compilers that define __GNUC__ see the
 * `deprecated` attribute on this function.
 *
 * @param threadpool  the thread pool handle.
 * @param function    callback of type pthreadpool_function_2d_tiled_t.
 * @param argument    opaque pointer; presumably passed to the callback as its
 *    first argument -- confirm against the implementation.
 * @param range_i     extent of the first dimension of the 2D range.
 * @param range_j     extent of the second dimension of the 2D range.
 * @param tile_i      tile size along the first dimension.
 * @param tile_j      tile size along the second dimension.
 */
void pthreadpool_compute_2d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_2d_tiled_t function,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t tile_i,
	size_t tile_j) PTHREADPOOL_DEPRECATED;
1464
1465 void pthreadpool_compute_3d_tiled(
1466 pthreadpool_t threadpool,
1467 pthreadpool_function_3d_tiled_t function,
1468 void* argument,
1469 size_t range_i,
1470 size_t range_j,
1471 size_t range_k,
1472 size_t tile_i,
1473 size_t tile_j,
1474 size_t tile_k) PTHREADPOOL_DEPRECATED;
1475
1476 void pthreadpool_compute_4d_tiled(
1477 pthreadpool_t threadpool,
1478 pthreadpool_function_4d_tiled_t function,
1479 void* argument,
1480 size_t range_i,
1481 size_t range_j,
1482 size_t range_k,
1483 size_t range_l,
1484 size_t tile_i,
1485 size_t tile_j,
1486 size_t tile_k,
1487 size_t tile_l) PTHREADPOOL_DEPRECATED;
1488
1489 #endif /* PTHREADPOOL_NO_DEPRECATED_API */
1490
1491 #ifdef __cplusplus
1492 } /* extern "C" */
1493 #endif
1494
1495 #ifdef __cplusplus
1496
namespace libpthreadpool {
namespace detail {
namespace {

/*
 * Trampolines used by the C++ pthreadpool_parallelize_* overloads.
 *
 * Each trampoline receives an untyped pointer as its first argument, views it
 * as a const object of type T, and invokes that object with the remaining
 * arguments, unchanged and in the same order. Nothing is returned.
 */

template<class T>
void call_wrapper_1d(void* arg, size_t i) {
	const T& callee = *static_cast<const T*>(arg);
	callee(i);
}

template<class T>
void call_wrapper_1d_tile_1d(void* arg, size_t range_i, size_t tile_i) {
	const T& callee = *static_cast<const T*>(arg);
	callee(range_i, tile_i);
}

template<class T>
void call_wrapper_2d(void* functor, size_t i, size_t j) {
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j);
}

template<class T>
void call_wrapper_2d_tile_1d(void* functor, size_t i, size_t range_j, size_t tile_j) {
	const T& callee = *static_cast<const T*>(functor);
	callee(i, range_j, tile_j);
}

template<class T>
void call_wrapper_2d_tile_2d(
	void* functor, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j)
{
	const T& callee = *static_cast<const T*>(functor);
	callee(range_i, range_j, tile_i, tile_j);
}

template<class T>
void call_wrapper_3d(void* functor, size_t i, size_t j, size_t k) {
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j, k);
}

template<class T>
void call_wrapper_3d_tile_1d(
	void* functor, size_t i, size_t j, size_t range_k, size_t tile_k)
{
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j, range_k, tile_k);
}

template<class T>
void call_wrapper_3d_tile_2d(
	void* functor, size_t i, size_t range_j, size_t range_k, size_t tile_j, size_t tile_k)
{
	const T& callee = *static_cast<const T*>(functor);
	callee(i, range_j, range_k, tile_j, tile_k);
}

template<class T>
void call_wrapper_4d(void* functor, size_t i, size_t j, size_t k, size_t l) {
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j, k, l);
}

template<class T>
void call_wrapper_4d_tile_1d(
	void* functor, size_t i, size_t j, size_t k, size_t range_l, size_t tile_l)
{
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j, k, range_l, tile_l);
}

template<class T>
void call_wrapper_4d_tile_2d(
	void* functor, size_t i, size_t j, size_t range_k, size_t range_l,
	size_t tile_k, size_t tile_l)
{
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j, range_k, range_l, tile_k, tile_l);
}

template<class T>
void call_wrapper_5d(void* functor, size_t i, size_t j, size_t k, size_t l, size_t m) {
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j, k, l, m);
}

template<class T>
void call_wrapper_5d_tile_1d(
	void* functor, size_t i, size_t j, size_t k, size_t l, size_t range_m,
	size_t tile_m)
{
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j, k, l, range_m, tile_m);
}

template<class T>
void call_wrapper_5d_tile_2d(
	void* functor, size_t i, size_t j, size_t k, size_t range_l, size_t range_m,
	size_t tile_l, size_t tile_m)
{
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j, k, range_l, range_m, tile_l, tile_m);
}

template<class T>
void call_wrapper_6d(
	void* functor, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n)
{
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j, k, l, m, n);
}

template<class T>
void call_wrapper_6d_tile_1d(
	void* functor, size_t i, size_t j, size_t k, size_t l, size_t m, size_t range_n,
	size_t tile_n)
{
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j, k, l, m, range_n, tile_n);
}

template<class T>
void call_wrapper_6d_tile_2d(
	void* functor, size_t i, size_t j, size_t k, size_t l, size_t range_m, size_t range_n,
	size_t tile_m, size_t tile_n)
{
	const T& callee = *static_cast<const T*>(functor);
	callee(i, j, k, l, range_m, range_n, tile_m, tile_n);
}

} /* namespace */
} /* namespace detail */
} /* namespace libpthreadpool */
1618
1619 /**
1620 * Process items on a 1D grid.
1621 *
1622 * The function implements a parallel version of the following snippet:
1623 *
1624 * for (size_t i = 0; i < range; i++)
1625 * functor(i);
1626 *
1627 * When the function returns, all items have been processed and the thread pool
1628 * is ready for a new task.
1629 *
1630 * @note If multiple threads call this function with the same thread pool, the
1631 * calls are serialized.
1632 *
1633 * @param threadpool the thread pool to use for parallelisation. If threadpool
1634 * is NULL, all items are processed serially on the calling thread.
1635 * @param functor the functor to call for each item.
1636 * @param range the number of items on the 1D grid to process. The
1637 * specified functor will be called once for each item.
1638 * @param flags a bitwise combination of zero or more optional flags
1639 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1640 */
1641 template<class T>
1642 inline void pthreadpool_parallelize_1d(
1643 pthreadpool_t threadpool,
1644 const T& functor,
1645 size_t range,
1646 uint32_t flags = 0)
1647 {
1648 pthreadpool_parallelize_1d(
1649 threadpool,
1650 &libpthreadpool::detail::call_wrapper_1d<const T>,
1651 const_cast<void*>(static_cast<const void*>(&functor)),
1652 range,
1653 flags);
1654 }
1655
1656 /**
1657 * Process items on a 1D grid with specified maximum tile size.
1658 *
1659 * The function implements a parallel version of the following snippet:
1660 *
1661 * for (size_t i = 0; i < range; i += tile)
1662 * functor(i, min(range - i, tile));
1663 *
1664 * When the call returns, all items have been processed and the thread pool is
1665 * ready for a new task.
1666 *
1667 * @note If multiple threads call this function with the same thread pool,
1668 * the calls are serialized.
1669 *
1670 * @param threadpool the thread pool to use for parallelisation. If threadpool
1671 * is NULL, all items are processed serially on the calling thread.
1672 * @param functor the functor to call for each tile.
1673 * @param range the number of items on the 1D grid to process.
1674 * @param tile the maximum number of items on the 1D grid to process in
1675 * one functor call.
1676 * @param flags a bitwise combination of zero or more optional flags
1677 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1678 */
1679 template<class T>
1680 inline void pthreadpool_parallelize_1d_tile_1d(
1681 pthreadpool_t threadpool,
1682 const T& functor,
1683 size_t range,
1684 size_t tile,
1685 uint32_t flags = 0)
1686 {
1687 pthreadpool_parallelize_1d_tile_1d(
1688 threadpool,
1689 &libpthreadpool::detail::call_wrapper_1d_tile_1d<const T>,
1690 const_cast<void*>(static_cast<const void*>(&functor)),
1691 range,
1692 tile,
1693 flags);
1694 }
1695
1696 /**
1697 * Process items on a 2D grid.
1698 *
1699 * The function implements a parallel version of the following snippet:
1700 *
1701 * for (size_t i = 0; i < range_i; i++)
1702 * for (size_t j = 0; j < range_j; j++)
1703 * functor(i, j);
1704 *
1705 * When the function returns, all items have been processed and the thread pool
1706 * is ready for a new task.
1707 *
1708 * @note If multiple threads call this function with the same thread pool, the
1709 * calls are serialized.
1710 *
1711 * @param threadpool the thread pool to use for parallelisation. If threadpool
1712 * is NULL, all items are processed serially on the calling thread.
1713 * @param functor the functor to call for each item.
1714 * @param range_i the number of items to process along the first dimension
1715 * of the 2D grid.
1716 * @param range_j the number of items to process along the second dimension
1717 * of the 2D grid.
1718 * @param flags a bitwise combination of zero or more optional flags
1719 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1720 */
1721 template<class T>
1722 inline void pthreadpool_parallelize_2d(
1723 pthreadpool_t threadpool,
1724 const T& functor,
1725 size_t range_i,
1726 size_t range_j,
1727 uint32_t flags = 0)
1728 {
1729 pthreadpool_parallelize_2d(
1730 threadpool,
1731 &libpthreadpool::detail::call_wrapper_2d<const T>,
1732 const_cast<void*>(static_cast<const void*>(&functor)),
1733 range_i,
1734 range_j,
1735 flags);
1736 }
1737
1738 /**
1739 * Process items on a 2D grid with the specified maximum tile size along the
1740 * last grid dimension.
1741 *
1742 * The function implements a parallel version of the following snippet:
1743 *
1744 * for (size_t i = 0; i < range_i; i++)
1745 * for (size_t j = 0; j < range_j; j += tile_j)
1746 * functor(i, j, min(range_j - j, tile_j));
1747 *
1748 * When the function returns, all items have been processed and the thread pool
1749 * is ready for a new task.
1750 *
1751 * @note If multiple threads call this function with the same thread pool, the
1752 * calls are serialized.
1753 *
1754 * @param threadpool the thread pool to use for parallelisation. If threadpool
1755 * is NULL, all items are processed serially on the calling thread.
1756 * @param functor the functor to call for each tile.
1757 * @param range_i the number of items to process along the first dimension
1758 * of the 2D grid.
1759 * @param range_j the number of items to process along the second dimension
1760 * of the 2D grid.
1761 * @param tile_j the maximum number of items along the second dimension of
1762 * the 2D grid to process in one functor call.
1763 * @param flags a bitwise combination of zero or more optional flags
1764 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1765 */
1766 template<class T>
1767 inline void pthreadpool_parallelize_2d_tile_1d(
1768 pthreadpool_t threadpool,
1769 const T& functor,
1770 size_t range_i,
1771 size_t range_j,
1772 size_t tile_j,
1773 uint32_t flags = 0)
1774 {
1775 pthreadpool_parallelize_2d_tile_1d(
1776 threadpool,
1777 &libpthreadpool::detail::call_wrapper_2d_tile_1d<const T>,
1778 const_cast<void*>(static_cast<const void*>(&functor)),
1779 range_i,
1780 range_j,
1781 tile_j,
1782 flags);
1783 }
1784
1785 /**
1786 * Process items on a 2D grid with the specified maximum tile size along each
1787 * grid dimension.
1788 *
1789 * The function implements a parallel version of the following snippet:
1790 *
1791 * for (size_t i = 0; i < range_i; i += tile_i)
1792 * for (size_t j = 0; j < range_j; j += tile_j)
1793 * functor(i, j,
1794 * min(range_i - i, tile_i), min(range_j - j, tile_j));
1795 *
1796 * When the function returns, all items have been processed and the thread pool
1797 * is ready for a new task.
1798 *
1799 * @note If multiple threads call this function with the same thread pool, the
1800 * calls are serialized.
1801 *
1802 * @param threadpool the thread pool to use for parallelisation. If threadpool
1803 * is NULL, all items are processed serially on the calling thread.
1804 * @param functor the functor to call for each tile.
1805 * @param range_i the number of items to process along the first dimension
1806 * of the 2D grid.
1807 * @param range_j the number of items to process along the second dimension
1808 * of the 2D grid.
 * @param tile_i      the maximum number of items along the first dimension of
1810 * the 2D grid to process in one functor call.
1811 * @param tile_j the maximum number of items along the second dimension of
1812 * the 2D grid to process in one functor call.
1813 * @param flags a bitwise combination of zero or more optional flags
1814 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1815 */
1816 template<class T>
1817 inline void pthreadpool_parallelize_2d_tile_2d(
1818 pthreadpool_t threadpool,
1819 const T& functor,
1820 size_t range_i,
1821 size_t range_j,
1822 size_t tile_i,
1823 size_t tile_j,
1824 uint32_t flags = 0)
1825 {
1826 pthreadpool_parallelize_2d_tile_2d(
1827 threadpool,
1828 &libpthreadpool::detail::call_wrapper_2d_tile_2d<const T>,
1829 const_cast<void*>(static_cast<const void*>(&functor)),
1830 range_i,
1831 range_j,
1832 tile_i,
1833 tile_j,
1834 flags);
1835 }
1836
1837 /**
1838 * Process items on a 3D grid.
1839 *
1840 * The function implements a parallel version of the following snippet:
1841 *
1842 * for (size_t i = 0; i < range_i; i++)
1843 * for (size_t j = 0; j < range_j; j++)
1844 * for (size_t k = 0; k < range_k; k++)
1845 * functor(i, j, k);
1846 *
1847 * When the function returns, all items have been processed and the thread pool
1848 * is ready for a new task.
1849 *
1850 * @note If multiple threads call this function with the same thread pool, the
1851 * calls are serialized.
1852 *
1853 * @param threadpool the thread pool to use for parallelisation. If threadpool
1854 * is NULL, all items are processed serially on the calling thread.
 * @param functor     the functor to call for each item.
1856 * @param range_i the number of items to process along the first dimension
1857 * of the 3D grid.
1858 * @param range_j the number of items to process along the second dimension
1859 * of the 3D grid.
1860 * @param range_k the number of items to process along the third dimension
1861 * of the 3D grid.
1862 * @param flags a bitwise combination of zero or more optional flags
1863 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1864 */
1865 template<class T>
1866 inline void pthreadpool_parallelize_3d(
1867 pthreadpool_t threadpool,
1868 const T& functor,
1869 size_t range_i,
1870 size_t range_j,
1871 size_t range_k,
1872 uint32_t flags = 0)
1873 {
1874 pthreadpool_parallelize_3d(
1875 threadpool,
1876 &libpthreadpool::detail::call_wrapper_3d<const T>,
1877 const_cast<void*>(static_cast<const void*>(&functor)),
1878 range_i,
1879 range_j,
1880 range_k,
1881 flags);
1882 }
1883
1884 /**
1885 * Process items on a 3D grid with the specified maximum tile size along the
1886 * last grid dimension.
1887 *
1888 * The function implements a parallel version of the following snippet:
1889 *
1890 * for (size_t i = 0; i < range_i; i++)
1891 * for (size_t j = 0; j < range_j; j++)
1892 * for (size_t k = 0; k < range_k; k += tile_k)
1893 * functor(i, j, k, min(range_k - k, tile_k));
1894 *
1895 * When the function returns, all items have been processed and the thread pool
1896 * is ready for a new task.
1897 *
1898 * @note If multiple threads call this function with the same thread pool, the
1899 * calls are serialized.
1900 *
1901 * @param threadpool the thread pool to use for parallelisation. If threadpool
1902 * is NULL, all items are processed serially on the calling thread.
1903 * @param functor the functor to call for each tile.
1904 * @param range_i the number of items to process along the first dimension
1905 * of the 3D grid.
1906 * @param range_j the number of items to process along the second dimension
1907 * of the 3D grid.
1908 * @param range_k the number of items to process along the third dimension
1909 * of the 3D grid.
1910 * @param tile_k the maximum number of items along the third dimension of
1911 * the 3D grid to process in one functor call.
1912 * @param flags a bitwise combination of zero or more optional flags
1913 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1914 */
1915 template<class T>
1916 inline void pthreadpool_parallelize_3d_tile_1d(
1917 pthreadpool_t threadpool,
1918 const T& functor,
1919 size_t range_i,
1920 size_t range_j,
1921 size_t range_k,
1922 size_t tile_k,
1923 uint32_t flags = 0)
1924 {
1925 pthreadpool_parallelize_3d_tile_1d(
1926 threadpool,
1927 &libpthreadpool::detail::call_wrapper_3d_tile_1d<const T>,
1928 const_cast<void*>(static_cast<const void*>(&functor)),
1929 range_i,
1930 range_j,
1931 range_k,
1932 tile_k,
1933 flags);
1934 }
1935
1936 /**
1937 * Process items on a 3D grid with the specified maximum tile size along the
1938 * last two grid dimensions.
1939 *
1940 * The function implements a parallel version of the following snippet:
1941 *
1942 * for (size_t i = 0; i < range_i; i++)
1943 * for (size_t j = 0; j < range_j; j += tile_j)
1944 * for (size_t k = 0; k < range_k; k += tile_k)
1945 * functor(i, j, k,
1946 * min(range_j - j, tile_j), min(range_k - k, tile_k));
1947 *
1948 * When the function returns, all items have been processed and the thread pool
1949 * is ready for a new task.
1950 *
1951 * @note If multiple threads call this function with the same thread pool, the
1952 * calls are serialized.
1953 *
1954 * @param threadpool the thread pool to use for parallelisation. If threadpool
1955 * is NULL, all items are processed serially on the calling thread.
1956 * @param functor the functor to call for each tile.
1957 * @param range_i the number of items to process along the first dimension
1958 * of the 3D grid.
1959 * @param range_j the number of items to process along the second dimension
1960 * of the 3D grid.
1961 * @param range_k the number of items to process along the third dimension
1962 * of the 3D grid.
1963 * @param tile_j the maximum number of items along the second dimension of
1964 * the 3D grid to process in one functor call.
1965 * @param tile_k the maximum number of items along the third dimension of
1966 * the 3D grid to process in one functor call.
1967 * @param flags a bitwise combination of zero or more optional flags
1968 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1969 */
1970 template<class T>
1971 inline void pthreadpool_parallelize_3d_tile_2d(
1972 pthreadpool_t threadpool,
1973 const T& functor,
1974 size_t range_i,
1975 size_t range_j,
1976 size_t range_k,
1977 size_t tile_j,
1978 size_t tile_k,
1979 uint32_t flags = 0)
1980 {
1981 pthreadpool_parallelize_3d_tile_2d(
1982 threadpool,
1983 &libpthreadpool::detail::call_wrapper_3d_tile_2d<const T>,
1984 const_cast<void*>(static_cast<const void*>(&functor)),
1985 range_i,
1986 range_j,
1987 range_k,
1988 tile_j,
1989 tile_k,
1990 flags);
1991 }
1992
1993 /**
1994 * Process items on a 4D grid.
1995 *
1996 * The function implements a parallel version of the following snippet:
1997 *
1998 * for (size_t i = 0; i < range_i; i++)
1999 * for (size_t j = 0; j < range_j; j++)
2000 * for (size_t k = 0; k < range_k; k++)
2001 * for (size_t l = 0; l < range_l; l++)
2002 * functor(i, j, k, l);
2003 *
2004 * When the function returns, all items have been processed and the thread pool
2005 * is ready for a new task.
2006 *
2007 * @note If multiple threads call this function with the same thread pool, the
2008 * calls are serialized.
2009 *
2010 * @param threadpool the thread pool to use for parallelisation. If threadpool
2011 * is NULL, all items are processed serially on the calling thread.
 * @param functor     the functor to call for each item.
2013 * @param range_i the number of items to process along the first dimension
2014 * of the 4D grid.
2015 * @param range_j the number of items to process along the second dimension
2016 * of the 4D grid.
2017 * @param range_k the number of items to process along the third dimension
2018 * of the 4D grid.
2019 * @param range_l the number of items to process along the fourth dimension
2020 * of the 4D grid.
2021 * @param flags a bitwise combination of zero or more optional flags
2022 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2023 */
2024 template<class T>
2025 inline void pthreadpool_parallelize_4d(
2026 pthreadpool_t threadpool,
2027 const T& functor,
2028 size_t range_i,
2029 size_t range_j,
2030 size_t range_k,
2031 size_t range_l,
2032 uint32_t flags = 0)
2033 {
2034 pthreadpool_parallelize_4d(
2035 threadpool,
2036 &libpthreadpool::detail::call_wrapper_4d<const T>,
2037 const_cast<void*>(static_cast<const void*>(&functor)),
2038 range_i,
2039 range_j,
2040 range_k,
2041 range_l,
2042 flags);
2043 }
2044
2045 /**
2046 * Process items on a 4D grid with the specified maximum tile size along the
2047 * last grid dimension.
2048 *
2049 * The function implements a parallel version of the following snippet:
2050 *
2051 * for (size_t i = 0; i < range_i; i++)
2052 * for (size_t j = 0; j < range_j; j++)
2053 * for (size_t k = 0; k < range_k; k++)
2054 * for (size_t l = 0; l < range_l; l += tile_l)
2055 * functor(i, j, k, l, min(range_l - l, tile_l));
2056 *
2057 * When the function returns, all items have been processed and the thread pool
2058 * is ready for a new task.
2059 *
2060 * @note If multiple threads call this function with the same thread pool, the
2061 * calls are serialized.
2062 *
2063 * @param threadpool the thread pool to use for parallelisation. If threadpool
2064 * is NULL, all items are processed serially on the calling thread.
2065 * @param functor the functor to call for each tile.
2066 * @param range_i the number of items to process along the first dimension
2067 * of the 4D grid.
2068 * @param range_j the number of items to process along the second dimension
2069 * of the 4D grid.
2070 * @param range_k the number of items to process along the third dimension
2071 * of the 4D grid.
2072 * @param range_l the number of items to process along the fourth dimension
2073 * of the 4D grid.
2074 * @param tile_l the maximum number of items along the fourth dimension of
2075 * the 4D grid to process in one functor call.
2076 * @param flags a bitwise combination of zero or more optional flags
2077 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2078 */
2079 template<class T>
2080 inline void pthreadpool_parallelize_4d_tile_1d(
2081 pthreadpool_t threadpool,
2082 const T& functor,
2083 size_t range_i,
2084 size_t range_j,
2085 size_t range_k,
2086 size_t range_l,
2087 size_t tile_l,
2088 uint32_t flags = 0)
2089 {
2090 pthreadpool_parallelize_4d_tile_1d(
2091 threadpool,
2092 &libpthreadpool::detail::call_wrapper_4d_tile_1d<const T>,
2093 const_cast<void*>(static_cast<const void*>(&functor)),
2094 range_i,
2095 range_j,
2096 range_k,
2097 range_l,
2098 tile_l,
2099 flags);
2100 }
2101
2102 /**
2103 * Process items on a 4D grid with the specified maximum tile size along the
2104 * last two grid dimensions.
2105 *
2106 * The function implements a parallel version of the following snippet:
2107 *
2108 * for (size_t i = 0; i < range_i; i++)
2109 * for (size_t j = 0; j < range_j; j++)
2110 * for (size_t k = 0; k < range_k; k += tile_k)
2111 * for (size_t l = 0; l < range_l; l += tile_l)
2112 * functor(i, j, k, l,
2113 * min(range_k - k, tile_k), min(range_l - l, tile_l));
2114 *
2115 * When the function returns, all items have been processed and the thread pool
2116 * is ready for a new task.
2117 *
2118 * @note If multiple threads call this function with the same thread pool, the
2119 * calls are serialized.
2120 *
2121 * @param threadpool the thread pool to use for parallelisation. If threadpool
2122 * is NULL, all items are processed serially on the calling thread.
2123 * @param functor the functor to call for each tile.
2124 * @param range_i the number of items to process along the first dimension
2125 * of the 4D grid.
2126 * @param range_j the number of items to process along the second dimension
2127 * of the 4D grid.
2128 * @param range_k the number of items to process along the third dimension
2129 * of the 4D grid.
2130 * @param range_l the number of items to process along the fourth dimension
2131 * of the 4D grid.
2132 * @param tile_k the maximum number of items along the third dimension of
2133 * the 4D grid to process in one functor call.
2134 * @param tile_l the maximum number of items along the fourth dimension of
2135 * the 4D grid to process in one functor call.
2136 * @param flags a bitwise combination of zero or more optional flags
2137 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2138 */
2139 template<class T>
2140 inline void pthreadpool_parallelize_4d_tile_2d(
2141 pthreadpool_t threadpool,
2142 const T& functor,
2143 size_t range_i,
2144 size_t range_j,
2145 size_t range_k,
2146 size_t range_l,
2147 size_t tile_k,
2148 size_t tile_l,
2149 uint32_t flags = 0)
2150 {
2151 pthreadpool_parallelize_4d_tile_2d(
2152 threadpool,
2153 &libpthreadpool::detail::call_wrapper_4d_tile_2d<const T>,
2154 const_cast<void*>(static_cast<const void*>(&functor)),
2155 range_i,
2156 range_j,
2157 range_k,
2158 range_l,
2159 tile_k,
2160 tile_l,
2161 flags);
2162 }
2163
2164 /**
2165 * Process items on a 5D grid.
2166 *
2167 * The function implements a parallel version of the following snippet:
2168 *
2169 * for (size_t i = 0; i < range_i; i++)
2170 * for (size_t j = 0; j < range_j; j++)
2171 * for (size_t k = 0; k < range_k; k++)
2172 * for (size_t l = 0; l < range_l; l++)
2173 * for (size_t m = 0; m < range_m; m++)
2174 * functor(i, j, k, l, m);
2175 *
2176 * When the function returns, all items have been processed and the thread pool
2177 * is ready for a new task.
2178 *
2179 * @note If multiple threads call this function with the same thread pool, the
2180 * calls are serialized.
2181 *
2182 * @param threadpool the thread pool to use for parallelisation. If threadpool
2183 * is NULL, all items are processed serially on the calling thread.
 * @param functor     the functor to call for each item.
2185 * @param range_i the number of items to process along the first dimension
2186 * of the 5D grid.
2187 * @param range_j the number of items to process along the second dimension
2188 * of the 5D grid.
2189 * @param range_k the number of items to process along the third dimension
2190 * of the 5D grid.
2191 * @param range_l the number of items to process along the fourth dimension
2192 * of the 5D grid.
2193 * @param range_m the number of items to process along the fifth dimension
2194 * of the 5D grid.
2195 * @param flags a bitwise combination of zero or more optional flags
2196 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2197 */
2198 template<class T>
2199 inline void pthreadpool_parallelize_5d(
2200 pthreadpool_t threadpool,
2201 const T& functor,
2202 size_t range_i,
2203 size_t range_j,
2204 size_t range_k,
2205 size_t range_l,
2206 size_t range_m,
2207 uint32_t flags = 0)
2208 {
2209 pthreadpool_parallelize_5d(
2210 threadpool,
2211 &libpthreadpool::detail::call_wrapper_5d<const T>,
2212 const_cast<void*>(static_cast<const void*>(&functor)),
2213 range_i,
2214 range_j,
2215 range_k,
2216 range_l,
2217 range_m,
2218 flags);
2219 }
2220
2221 /**
2222 * Process items on a 5D grid with the specified maximum tile size along the
2223 * last grid dimension.
2224 *
2225 * The function implements a parallel version of the following snippet:
2226 *
2227 * for (size_t i = 0; i < range_i; i++)
2228 * for (size_t j = 0; j < range_j; j++)
2229 * for (size_t k = 0; k < range_k; k++)
2230 * for (size_t l = 0; l < range_l; l++)
2231 * for (size_t m = 0; m < range_m; m += tile_m)
2232 * functor(i, j, k, l, m, min(range_m - m, tile_m));
2233 *
2234 * When the function returns, all items have been processed and the thread pool
2235 * is ready for a new task.
2236 *
2237 * @note If multiple threads call this function with the same thread pool, the
2238 * calls are serialized.
2239 *
2240 * @param threadpool the thread pool to use for parallelisation. If threadpool
2241 * is NULL, all items are processed serially on the calling thread.
2242 * @param functor the functor to call for each tile.
2243 * @param range_i the number of items to process along the first dimension
2244 * of the 5D grid.
2245 * @param range_j the number of items to process along the second dimension
2246 * of the 5D grid.
2247 * @param range_k the number of items to process along the third dimension
2248 * of the 5D grid.
2249 * @param range_l the number of items to process along the fourth dimension
2250 * of the 5D grid.
2251 * @param range_m the number of items to process along the fifth dimension
2252 * of the 5D grid.
2253 * @param tile_m the maximum number of items along the fifth dimension of
2254 * the 5D grid to process in one functor call.
2255 * @param flags a bitwise combination of zero or more optional flags
2256 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2257 */
2258 template<class T>
2259 inline void pthreadpool_parallelize_5d_tile_1d(
2260 pthreadpool_t threadpool,
2261 const T& functor,
2262 size_t range_i,
2263 size_t range_j,
2264 size_t range_k,
2265 size_t range_l,
2266 size_t range_m,
2267 size_t tile_m,
2268 uint32_t flags = 0)
2269 {
2270 pthreadpool_parallelize_5d_tile_1d(
2271 threadpool,
2272 &libpthreadpool::detail::call_wrapper_5d_tile_1d<const T>,
2273 const_cast<void*>(static_cast<const void*>(&functor)),
2274 range_i,
2275 range_j,
2276 range_k,
2277 range_l,
2278 range_m,
2279 tile_m,
2280 flags);
2281 }
2282
2283 /**
2284 * Process items on a 5D grid with the specified maximum tile size along the
2285 * last two grid dimensions.
2286 *
2287 * The function implements a parallel version of the following snippet:
2288 *
2289 * for (size_t i = 0; i < range_i; i++)
2290 * for (size_t j = 0; j < range_j; j++)
2291 * for (size_t k = 0; k < range_k; k++)
2292 * for (size_t l = 0; l < range_l; l += tile_l)
2293 * for (size_t m = 0; m < range_m; m += tile_m)
2294 * functor(i, j, k, l, m,
2295 * min(range_l - l, tile_l), min(range_m - m, tile_m));
2296 *
2297 * When the function returns, all items have been processed and the thread pool
2298 * is ready for a new task.
2299 *
2300 * @note If multiple threads call this function with the same thread pool, the
2301 * calls are serialized.
2302 *
2303 * @param threadpool the thread pool to use for parallelisation. If threadpool
2304 * is NULL, all items are processed serially on the calling thread.
2305 * @param functor the functor to call for each tile.
2306 * @param range_i the number of items to process along the first dimension
2307 * of the 5D grid.
2308 * @param range_j the number of items to process along the second dimension
2309 * of the 5D grid.
2310 * @param range_k the number of items to process along the third dimension
2311 * of the 5D grid.
2312 * @param range_l the number of items to process along the fourth dimension
2313 * of the 5D grid.
2314 * @param range_m the number of items to process along the fifth dimension
2315 * of the 5D grid.
2316 * @param tile_l the maximum number of items along the fourth dimension of
2317 * the 5D grid to process in one functor call.
2318 * @param tile_m the maximum number of items along the fifth dimension of
2319 * the 5D grid to process in one functor call.
2320 * @param flags a bitwise combination of zero or more optional flags
2321 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2322 */
2323 template<class T>
2324 inline void pthreadpool_parallelize_5d_tile_2d(
2325 pthreadpool_t threadpool,
2326 const T& functor,
2327 size_t range_i,
2328 size_t range_j,
2329 size_t range_k,
2330 size_t range_l,
2331 size_t range_m,
2332 size_t tile_l,
2333 size_t tile_m,
2334 uint32_t flags = 0)
2335 {
2336 pthreadpool_parallelize_5d_tile_2d(
2337 threadpool,
2338 &libpthreadpool::detail::call_wrapper_5d_tile_2d<const T>,
2339 const_cast<void*>(static_cast<const void*>(&functor)),
2340 range_i,
2341 range_j,
2342 range_k,
2343 range_l,
2344 range_m,
2345 tile_l,
2346 tile_m,
2347 flags);
2348 }
2349
2350 /**
2351 * Process items on a 6D grid.
2352 *
2353 * The function implements a parallel version of the following snippet:
2354 *
2355 * for (size_t i = 0; i < range_i; i++)
2356 * for (size_t j = 0; j < range_j; j++)
2357 * for (size_t k = 0; k < range_k; k++)
2358 * for (size_t l = 0; l < range_l; l++)
2359 * for (size_t m = 0; m < range_m; m++)
2360 * for (size_t n = 0; n < range_n; n++)
2361 * functor(i, j, k, l, m, n);
2362 *
2363 * When the function returns, all items have been processed and the thread pool
2364 * is ready for a new task.
2365 *
2366 * @note If multiple threads call this function with the same thread pool, the
2367 * calls are serialized.
2368 *
2369 * @param threadpool the thread pool to use for parallelisation. If threadpool
2370 * is NULL, all items are processed serially on the calling thread.
2371 * @param functor the functor to call for each tile.
2372 * @param range_i the number of items to process along the first dimension
2373 * of the 6D grid.
2374 * @param range_j the number of items to process along the second dimension
2375 * of the 6D grid.
2376 * @param range_k the number of items to process along the third dimension
2377 * of the 6D grid.
2378 * @param range_l the number of items to process along the fourth dimension
2379 * of the 6D grid.
2380 * @param range_m the number of items to process along the fifth dimension
2381 * of the 6D grid.
2382 * @param range_n the number of items to process along the sixth dimension
2383 * of the 6D grid.
2384 * @param tile_n the maximum number of items along the sixth dimension of
2385 * the 6D grid to process in one functor call.
2386 * @param flags a bitwise combination of zero or more optional flags
2387 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2388 */
2389 template<class T>
2390 inline void pthreadpool_parallelize_6d(
2391 pthreadpool_t threadpool,
2392 const T& functor,
2393 size_t range_i,
2394 size_t range_j,
2395 size_t range_k,
2396 size_t range_l,
2397 size_t range_m,
2398 size_t range_n,
2399 uint32_t flags = 0)
2400 {
2401 pthreadpool_parallelize_6d(
2402 threadpool,
2403 &libpthreadpool::detail::call_wrapper_6d<const T>,
2404 const_cast<void*>(static_cast<const void*>(&functor)),
2405 range_i,
2406 range_j,
2407 range_k,
2408 range_l,
2409 range_m,
2410 range_n,
2411 flags);
2412 }
2413
2414 /**
2415 * Process items on a 6D grid with the specified maximum tile size along the
2416 * last grid dimension.
2417 *
2418 * The function implements a parallel version of the following snippet:
2419 *
2420 * for (size_t i = 0; i < range_i; i++)
2421 * for (size_t j = 0; j < range_j; j++)
2422 * for (size_t k = 0; k < range_k; k++)
2423 * for (size_t l = 0; l < range_l; l++)
2424 * for (size_t m = 0; m < range_m; m++)
2425 * for (size_t n = 0; n < range_n; n += tile_n)
2426 * functor(i, j, k, l, m, n, min(range_n - n, tile_n));
2427 *
2428 * When the function returns, all items have been processed and the thread pool
2429 * is ready for a new task.
2430 *
2431 * @note If multiple threads call this function with the same thread pool, the
2432 * calls are serialized.
2433 *
2434 * @param threadpool the thread pool to use for parallelisation. If threadpool
2435 * is NULL, all items are processed serially on the calling thread.
2436 * @param functor the functor to call for each tile.
2437 * @param range_i the number of items to process along the first dimension
2438 * of the 6D grid.
2439 * @param range_j the number of items to process along the second dimension
2440 * of the 6D grid.
2441 * @param range_k the number of items to process along the third dimension
2442 * of the 6D grid.
2443 * @param range_l the number of items to process along the fourth dimension
2444 * of the 6D grid.
2445 * @param range_m the number of items to process along the fifth dimension
2446 * of the 6D grid.
2447 * @param range_n the number of items to process along the sixth dimension
2448 * of the 6D grid.
2449 * @param tile_n the maximum number of items along the sixth dimension of
2450 * the 6D grid to process in one functor call.
2451 * @param flags a bitwise combination of zero or more optional flags
2452 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2453 */
2454 template<class T>
2455 inline void pthreadpool_parallelize_6d_tile_1d(
2456 pthreadpool_t threadpool,
2457 const T& functor,
2458 size_t range_i,
2459 size_t range_j,
2460 size_t range_k,
2461 size_t range_l,
2462 size_t range_m,
2463 size_t range_n,
2464 size_t tile_n,
2465 uint32_t flags = 0)
2466 {
2467 pthreadpool_parallelize_6d_tile_1d(
2468 threadpool,
2469 &libpthreadpool::detail::call_wrapper_6d_tile_1d<const T>,
2470 const_cast<void*>(static_cast<const void*>(&functor)),
2471 range_i,
2472 range_j,
2473 range_k,
2474 range_l,
2475 range_m,
2476 range_n,
2477 tile_n,
2478 flags);
2479 }
2480
2481 /**
2482 * Process items on a 6D grid with the specified maximum tile size along the
2483 * last two grid dimensions.
2484 *
2485 * The function implements a parallel version of the following snippet:
2486 *
2487 * for (size_t i = 0; i < range_i; i++)
2488 * for (size_t j = 0; j < range_j; j++)
2489 * for (size_t k = 0; k < range_k; k++)
2490 * for (size_t l = 0; l < range_l; l++)
2491 * for (size_t m = 0; m < range_m; m += tile_m)
2492 * for (size_t n = 0; n < range_n; n += tile_n)
2493 * functor(i, j, k, l, m, n,
2494 * min(range_m - m, tile_m), min(range_n - n, tile_n));
2495 *
2496 * When the function returns, all items have been processed and the thread pool
2497 * is ready for a new task.
2498 *
2499 * @note If multiple threads call this function with the same thread pool, the
2500 * calls are serialized.
2501 *
2502 * @param threadpool the thread pool to use for parallelisation. If threadpool
2503 * is NULL, all items are processed serially on the calling thread.
2504 * @param functor the functor to call for each tile.
2505 * @param range_i the number of items to process along the first dimension
2506 * of the 6D grid.
2507 * @param range_j the number of items to process along the second dimension
2508 * of the 6D grid.
2509 * @param range_k the number of items to process along the third dimension
2510 * of the 6D grid.
2511 * @param range_l the number of items to process along the fourth dimension
2512 * of the 6D grid.
2513 * @param range_m the number of items to process along the fifth dimension
2514 * of the 6D grid.
2515 * @param range_n the number of items to process along the sixth dimension
2516 * of the 6D grid.
2517 * @param tile_m the maximum number of items along the fifth dimension of
2518 * the 6D grid to process in one functor call.
2519 * @param tile_n the maximum number of items along the sixth dimension of
2520 * the 6D grid to process in one functor call.
2521 * @param flags a bitwise combination of zero or more optional flags
2522 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2523 */
2524 template<class T>
2525 inline void pthreadpool_parallelize_6d_tile_2d(
2526 pthreadpool_t threadpool,
2527 const T& functor,
2528 size_t range_i,
2529 size_t range_j,
2530 size_t range_k,
2531 size_t range_l,
2532 size_t range_m,
2533 size_t range_n,
2534 size_t tile_m,
2535 size_t tile_n,
2536 uint32_t flags = 0)
2537 {
2538 pthreadpool_parallelize_6d_tile_2d(
2539 threadpool,
2540 &libpthreadpool::detail::call_wrapper_6d_tile_2d<const T>,
2541 const_cast<void*>(static_cast<const void*>(&functor)),
2542 range_i,
2543 range_j,
2544 range_k,
2545 range_l,
2546 range_m,
2547 range_n,
2548 tile_m,
2549 tile_n,
2550 flags);
2551 }
2552
2553 #endif /* __cplusplus */
2554
2555 #endif /* PTHREADPOOL_H_ */
2556