• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef PTHREADPOOL_H_
2 #define PTHREADPOOL_H_
3 
4 #include <stddef.h>
5 #include <stdint.h>
6 
7 typedef struct pthreadpool* pthreadpool_t;
8 
9 typedef void (*pthreadpool_task_1d_t)(void*, size_t); /* (context, i) */
10 typedef void (*pthreadpool_task_1d_with_thread_t)(void*, size_t, size_t); /* (context, thread_index, i) */
11 typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t); /* (context, i, min(range - i, tile)) */
12 typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t); /* (context, i, j) */
13 typedef void (*pthreadpool_task_2d_with_thread_t)(void*, size_t, size_t, size_t); /* (context, thread_index, i, j) */
14 typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t); /* (context, i, j, min(range_j - j, tile_j)) */
15 typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t); /* (context, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)) */
16 typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t); /* (context, i, j, k) */
17 typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t); /* (context, i, j, k, min(range_k - k, tile_k)) */
18 typedef void (*pthreadpool_task_3d_tile_1d_with_thread_t)(void*, size_t, size_t, size_t, size_t, size_t); /* (context, thread_index, i, j, k, min(range_k - k, tile_k)) */
19 typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t);
20 typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t);
21 typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t);
22 typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
23 typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t);
24 typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
25 typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
26 typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
27 typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
28 typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
29 
30 typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t); /* (context, uarch_index, i) */
31 typedef void (*pthreadpool_task_2d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t); /* (context, uarch_index, i, j, min(range_j - j, tile_j)) */
32 typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); /* (context, uarch_index, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)) */
33 typedef void (*pthreadpool_task_3d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); /* (context, uarch_index, i, j, k, min(range_k - k, tile_k)) */
34 typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t);
35 typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t);
36 
37 typedef void (*pthreadpool_task_2d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t); /* (context, uarch_index, thread_index, i, j, min(range_j - j, tile_j)) */
38 typedef void (*pthreadpool_task_3d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t); /* NOTE(review): argument order is not shown in this excerpt; presumably mirrors the 2D variant above with an extra grid index - confirm */
39 
40 
41 /**
42  * Disable support for denormalized numbers to the maximum extent possible for
43  * the duration of the computation.
44  *
45  * Handling denormalized floating-point numbers is often implemented in
46  * microcode, and incurs significant performance degradation. This hint
47  * instructs the thread pool to disable support for denormalized numbers before
48  * running the computation by manipulating architecture-specific control
49  * registers, and restore the initial value of control registers after the
50  * computation is complete. The thread pool temporarily disables denormalized
51  * numbers on all threads involved in the computation (i.e. the caller threads,
52  * and potentially worker threads).
53  *
54  * Disabling denormalized numbers may have a small negative effect on results'
55  * accuracy. As various architectures differ in capabilities to control
56  * processing of denormalized numbers, using this flag may also hurt results'
57  * reproducibility across different instruction set architectures.
58  */
59 #define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001
60 
61 /**
62  * Yield worker threads to the system scheduler after the operation is finished.
63  *
64  * Force workers to use kernel wait (instead of active spin-wait by default) for
65  * new commands after this command is processed. This flag affects only the
66  * immediate next operation on this thread pool. To make the thread pool always
67  * use kernel wait, pass this flag to all parallelization functions.
68  */
69 #define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002
70 
71 #ifdef __cplusplus
72 extern "C" {
73 #endif
74 
75 /**
76  * Create a thread pool with the specified number of threads.
77  *
78  * @param  threads_count  the number of threads in the thread pool.
79  *    A value of 0 has special interpretation: it creates a thread pool with as
80  *    many threads as there are logical processors in the system.
81  *
82  * @returns  A pointer to an opaque thread pool object if the call is
83  *    successful, or NULL pointer if the call failed.
84  */
85 pthreadpool_t pthreadpool_create(size_t threads_count);
86 
87 /**
88  * Query the number of threads in a thread pool.
89  *
90  * @param  threadpool  the thread pool to query.
91  *
92  * @returns  The number of threads in the thread pool.
93  */
94 size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
95 
96 /**
97  * Process items on a 1D grid.
98  *
99  * The function implements a parallel version of the following snippet:
100  *
101  *   for (size_t i = 0; i < range; i++)
102  *     function(context, i);
103  *
104  * When the function returns, all items have been processed and the thread pool
105  * is ready for a new task.
106  *
107  * @note If multiple threads call this function with the same thread pool, the
108  *    calls are serialized.
109  *
110  * @param threadpool  the thread pool to use for parallelisation. If threadpool
111  *    is NULL, all items are processed serially on the calling thread.
112  * @param function    the function to call for each item.
113  * @param context     the first argument passed to the specified function.
114  * @param range       the number of items on the 1D grid to process. The
115  *    specified function will be called once for each item.
116  * @param flags       a bitwise combination of zero or more optional flags
117  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
118  */
119 void pthreadpool_parallelize_1d(
120 	pthreadpool_t threadpool,
121 	pthreadpool_task_1d_t function,
122 	void* context,
123 	size_t range,
124 	uint32_t flags);
125 
126 /**
127  * Process items on a 1D grid passing along the current thread id.
128  *
129  * The function implements a parallel version of the following snippet:
130  *
131  *   for (size_t i = 0; i < range; i++)
132  *     function(context, thread_index, i);
133  *
134  * When the function returns, all items have been processed and the thread pool
135  * is ready for a new task.
136  *
137  * @note If multiple threads call this function with the same thread pool, the
138  *    calls are serialized.
139  *
140  * @param threadpool  the thread pool to use for parallelisation. If threadpool
141  *    is NULL, all items are processed serially on the calling thread.
142  * @param function    the function to call for each item.
143  * @param context     the first argument passed to the specified function.
144  * @param range       the number of items on the 1D grid to process. The
145  *    specified function will be called once for each item.
146  * @param flags       a bitwise combination of zero or more optional flags
147  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
148  */
149 void pthreadpool_parallelize_1d_with_thread(
150 	pthreadpool_t threadpool,
151 	pthreadpool_task_1d_with_thread_t function,
152 	void* context,
153 	size_t range,
154 	uint32_t flags);
155 
156 /**
157  * Process items on a 1D grid using a microarchitecture-aware task function.
158  *
159  * The function implements a parallel version of the following snippet:
160  *
161  *   uint32_t uarch_index = cpuinfo_initialize() ?
162  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
163  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
164  *   for (size_t i = 0; i < range; i++)
165  *     function(context, uarch_index, i);
166  *
167  * When the function returns, all items have been processed and the thread pool
168  * is ready for a new task.
169  *
170  * @note If multiple threads call this function with the same thread pool, the
171  *    calls are serialized.
172  *
173  * @param threadpool           the thread pool to use for parallelisation. If
174  *    threadpool is NULL, all items are processed serially on the calling
175  *    thread.
176  * @param function             the function to call for each item.
177  * @param context              the first argument passed to the specified
178  *    function.
179  * @param default_uarch_index  the microarchitecture index to use when
180  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
181  *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
182  *    max_uarch_index value.
183  * @param max_uarch_index      the maximum microarchitecture index expected by
184  *    the specified function. If the index returned by
185  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
186  *    will be used instead. default_uarch_index can exceed max_uarch_index.
187  * @param range                the number of items on the 1D grid to process.
188  *    The specified function will be called once for each item.
189  * @param flags                a bitwise combination of zero or more optional
190  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
191  *    PTHREADPOOL_FLAG_YIELD_WORKERS)
192  */
193 void pthreadpool_parallelize_1d_with_uarch(
194 	pthreadpool_t threadpool,
195 	pthreadpool_task_1d_with_id_t function,
196 	void* context,
197 	uint32_t default_uarch_index,
198 	uint32_t max_uarch_index,
199 	size_t range,
200 	uint32_t flags);
201 
202 /**
203  * Process items on a 1D grid with specified maximum tile size.
204  *
205  * The function implements a parallel version of the following snippet:
206  *
207  *   for (size_t i = 0; i < range; i += tile)
208  *     function(context, i, min(range - i, tile));
209  *
210  * When the call returns, all items have been processed and the thread pool is
211  * ready for a new task.
212  *
213  * @note If multiple threads call this function with the same thread pool,
214  *    the calls are serialized.
215  *
216  * @param threadpool  the thread pool to use for parallelisation. If threadpool
217  *    is NULL, all items are processed serially on the calling thread.
218  * @param function    the function to call for each tile.
219  * @param context     the first argument passed to the specified function.
220  * @param range       the number of items on the 1D grid to process.
221  * @param tile        the maximum number of items on the 1D grid to process in
222  *    one function call.
223  * @param flags       a bitwise combination of zero or more optional flags
224  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
225  */
226 void pthreadpool_parallelize_1d_tile_1d(
227 	pthreadpool_t threadpool,
228 	pthreadpool_task_1d_tile_1d_t function,
229 	void* context,
230 	size_t range,
231 	size_t tile,
232 	uint32_t flags);
233 
234 /**
235  * Process items on a 2D grid.
236  *
237  * The function implements a parallel version of the following snippet:
238  *
239  *   for (size_t i = 0; i < range_i; i++)
240  *     for (size_t j = 0; j < range_j; j++)
241  *       function(context, i, j);
242  *
243  * When the function returns, all items have been processed and the thread pool
244  * is ready for a new task.
245  *
246  * @note If multiple threads call this function with the same thread pool, the
247  *    calls are serialized.
248  *
249  * @param threadpool  the thread pool to use for parallelisation. If threadpool
250  *    is NULL, all items are processed serially on the calling thread.
251  * @param function    the function to call for each item.
252  * @param context     the first argument passed to the specified function.
253  * @param range_i     the number of items to process along the first dimension
254  *    of the 2D grid.
255  * @param range_j     the number of items to process along the second dimension
256  *    of the 2D grid.
257  * @param flags       a bitwise combination of zero or more optional flags
258  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
259  */
260 void pthreadpool_parallelize_2d(
261 	pthreadpool_t threadpool,
262 	pthreadpool_task_2d_t function,
263 	void* context,
264 	size_t range_i,
265 	size_t range_j,
266 	uint32_t flags);
267 
268 /**
269  * Process items on a 2D grid passing along the current thread id.
270  *
271  * The function implements a parallel version of the following snippet:
272  *
273  *   for (size_t i = 0; i < range_i; i++)
274  *     for (size_t j = 0; j < range_j; j++)
275  *       function(context, thread_index, i, j);
276  *
277  * When the function returns, all items have been processed and the thread pool
278  * is ready for a new task.
279  *
280  * @note If multiple threads call this function with the same thread pool, the
281  *    calls are serialized.
282  *
283  * @param threadpool  the thread pool to use for parallelisation. If threadpool
284  *    is NULL, all items are processed serially on the calling thread.
285  * @param function    the function to call for each item.
286  * @param context     the first argument passed to the specified function.
287  * @param range_i     the number of items to process along the first dimension
288  *    of the 2D grid.
289  * @param range_j     the number of items to process along the second dimension
290  *    of the 2D grid.
291  * @param flags       a bitwise combination of zero or more optional flags
292  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
293  */
294 void pthreadpool_parallelize_2d_with_thread(
295 	pthreadpool_t threadpool,
296 	pthreadpool_task_2d_with_thread_t function,
297 	void* context,
298 	size_t range_i,
299 	size_t range_j,
300 	uint32_t flags);
301 
302 /**
303  * Process items on a 2D grid with the specified maximum tile size along the
304  * last grid dimension.
305  *
306  * The function implements a parallel version of the following snippet:
307  *
308  *   for (size_t i = 0; i < range_i; i++)
309  *     for (size_t j = 0; j < range_j; j += tile_j)
310  *       function(context, i, j, min(range_j - j, tile_j));
311  *
312  * When the function returns, all items have been processed and the thread pool
313  * is ready for a new task.
314  *
315  * @note If multiple threads call this function with the same thread pool, the
316  *    calls are serialized.
317  *
318  * @param threadpool  the thread pool to use for parallelisation. If threadpool
319  *    is NULL, all items are processed serially on the calling thread.
320  * @param function    the function to call for each tile.
321  * @param context     the first argument passed to the specified function.
322  * @param range_i     the number of items to process along the first dimension
323  *    of the 2D grid.
324  * @param range_j     the number of items to process along the second dimension
325  *    of the 2D grid.
326  * @param tile_j      the maximum number of items along the second dimension of
327  *    the 2D grid to process in one function call.
328  * @param flags       a bitwise combination of zero or more optional flags
329  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
330  */
331 void pthreadpool_parallelize_2d_tile_1d(
332 	pthreadpool_t threadpool,
333 	pthreadpool_task_2d_tile_1d_t function,
334 	void* context,
335 	size_t range_i,
336 	size_t range_j,
337 	size_t tile_j,
338 	uint32_t flags);
339 
340 /**
341  * Process items on a 2D grid with the specified maximum tile size along the
342  * last grid dimension using a microarchitecture-aware task function.
343  *
344  * The function implements a parallel version of the following snippet:
345  *
346  *   uint32_t uarch_index = cpuinfo_initialize() ?
347  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
348  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
349  *   for (size_t i = 0; i < range_i; i++)
350  *     for (size_t j = 0; j < range_j; j += tile_j)
351  *       function(context, uarch_index, i, j, min(range_j - j, tile_j));
352  *
353  * When the function returns, all items have been processed and the thread pool
354  * is ready for a new task.
355  *
356  * @note If multiple threads call this function with the same thread pool, the
357  *    calls are serialized.
358  *
359  * @param threadpool  the thread pool to use for parallelisation. If threadpool
360  *    is NULL, all items are processed serially on the calling thread.
361  * @param function    the function to call for each tile.
362  * @param context     the first argument passed to the specified function.
363  * @param default_uarch_index  the microarchitecture index to use when
364  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
365  *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
366  *    max_uarch_index value.
367  * @param max_uarch_index      the maximum microarchitecture index expected by
368  *    the specified function. If the index returned by
369  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
370  *    will be used instead. default_uarch_index can exceed max_uarch_index.
371  * @param range_i     the number of items to process along the first dimension
372  *    of the 2D grid.
373  * @param range_j     the number of items to process along the second dimension
374  *    of the 2D grid.
375  * @param tile_j      the maximum number of items along the second dimension of
376  *    the 2D grid to process in one function call.
377  * @param flags       a bitwise combination of zero or more optional flags
378  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
379  */
380 void pthreadpool_parallelize_2d_tile_1d_with_uarch(
381 	pthreadpool_t threadpool,
382 	pthreadpool_task_2d_tile_1d_with_id_t function,
383 	void* context,
384 	uint32_t default_uarch_index,
385 	uint32_t max_uarch_index,
386 	size_t range_i,
387 	size_t range_j,
388 	size_t tile_j,
389 	uint32_t flags);
390 
391 /**
392  * Process items on a 2D grid with the specified maximum tile size along the
393  * last grid dimension using a microarchitecture-aware task function and passing
394  * along the current thread id.
395  *
396  * The function implements a parallel version of the following snippet:
397  *
398  *   uint32_t uarch_index = cpuinfo_initialize() ?
399  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
400  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
401  *   for (size_t i = 0; i < range_i; i++)
402  *     for (size_t j = 0; j < range_j; j += tile_j)
403  *       function(context, uarch_index, thread_index, i, j, min(range_j - j, tile_j));
404  *
405  * When the function returns, all items have been processed and the thread pool
406  * is ready for a new task.
407  *
408  * @note If multiple threads call this function with the same thread pool, the
409  *    calls are serialized.
410  *
411  * @param threadpool  the thread pool to use for parallelisation. If threadpool
412  *    is NULL, all items are processed serially on the calling thread.
413  * @param function    the function to call for each tile.
414  * @param context     the first argument passed to the specified function.
415  * @param default_uarch_index  the microarchitecture index to use when
416  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
417  *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
418  *    max_uarch_index value.
419  * @param max_uarch_index      the maximum microarchitecture index expected by
420  *    the specified function. If the index returned by
421  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
422  *    will be used instead. default_uarch_index can exceed max_uarch_index.
423  * @param range_i     the number of items to process along the first dimension
424  *    of the 2D grid.
425  * @param range_j     the number of items to process along the second dimension
426  *    of the 2D grid.
427  * @param tile_j      the maximum number of items along the second dimension of
428  *    the 2D grid to process in one function call.
429  * @param flags       a bitwise combination of zero or more optional flags
430  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
431  */
432 void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread(
433 	pthreadpool_t threadpool,
434 	pthreadpool_task_2d_tile_1d_with_id_with_thread_t function,
435 	void* context,
436 	uint32_t default_uarch_index,
437 	uint32_t max_uarch_index,
438 	size_t range_i,
439 	size_t range_j,
440 	size_t tile_j,
441 	uint32_t flags);
442 
443 /**
444  * Process items on a 2D grid with the specified maximum tile size along each
445  * grid dimension.
446  *
447  * The function implements a parallel version of the following snippet:
448  *
449  *   for (size_t i = 0; i < range_i; i += tile_i)
450  *     for (size_t j = 0; j < range_j; j += tile_j)
451  *       function(context, i, j,
452  *         min(range_i - i, tile_i), min(range_j - j, tile_j));
453  *
454  * When the function returns, all items have been processed and the thread pool
455  * is ready for a new task.
456  *
457  * @note If multiple threads call this function with the same thread pool, the
458  *    calls are serialized.
459  *
460  * @param threadpool  the thread pool to use for parallelisation. If threadpool
461  *    is NULL, all items are processed serially on the calling thread.
462  * @param function    the function to call for each tile.
463  * @param context     the first argument passed to the specified function.
464  * @param range_i     the number of items to process along the first dimension
465  *    of the 2D grid.
466  * @param range_j     the number of items to process along the second dimension
467  *    of the 2D grid.
468  * @param tile_i      the maximum number of items along the first dimension of
469  *    the 2D grid to process in one function call.
470  * @param tile_j      the maximum number of items along the second dimension of
471  *    the 2D grid to process in one function call.
472  * @param flags       a bitwise combination of zero or more optional flags
473  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
474  */
475 void pthreadpool_parallelize_2d_tile_2d(
476 	pthreadpool_t threadpool,
477 	pthreadpool_task_2d_tile_2d_t function,
478 	void* context,
479 	size_t range_i,
480 	size_t range_j,
481 	size_t tile_i,
482 	size_t tile_j,
483 	uint32_t flags);
484 
485 /**
486  * Process items on a 2D grid with the specified maximum tile size along each
487  * grid dimension using a microarchitecture-aware task function.
488  *
489  * The function implements a parallel version of the following snippet:
490  *
491  *   uint32_t uarch_index = cpuinfo_initialize() ?
492  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
493  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
494  *   for (size_t i = 0; i < range_i; i += tile_i)
495  *     for (size_t j = 0; j < range_j; j += tile_j)
496  *       function(context, uarch_index, i, j,
497  *         min(range_i - i, tile_i), min(range_j - j, tile_j));
498  *
499  * When the function returns, all items have been processed and the thread pool
500  * is ready for a new task.
501  *
502  * @note If multiple threads call this function with the same thread pool, the
503  *    calls are serialized.
504  *
505  * @param threadpool           the thread pool to use for parallelisation. If
506  *    threadpool is NULL, all items are processed serially on the calling
507  *    thread.
508  * @param function             the function to call for each tile.
509  * @param context              the first argument passed to the specified
510  *    function.
511  * @param default_uarch_index  the microarchitecture index to use when
512  *                             pthreadpool is configured without cpuinfo,
513  *                             cpuinfo initialization failed, or index returned
514  *                             by cpuinfo_get_current_uarch_index() exceeds
515  *                             the max_uarch_index value.
516  * @param max_uarch_index      the maximum microarchitecture index expected
517  *                             by the specified function. If the index returned
518  *                             by cpuinfo_get_current_uarch_index() exceeds this
519  *                             value, default_uarch_index will be used instead.
520  *                             default_uarch_index can exceed max_uarch_index.
521  * @param range_i              the number of items to process along the first
522  *    dimension of the 2D grid.
523  * @param range_j              the number of items to process along the second
524  *    dimension of the 2D grid.
525  * @param tile_i               the maximum number of items along the first
526  *    dimension of the 2D grid to process in one function call.
527  * @param tile_j               the maximum number of items along the second
528  *    dimension of the 2D grid to process in one function call.
529  * @param flags                a bitwise combination of zero or more optional
530  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
531  *    PTHREADPOOL_FLAG_YIELD_WORKERS)
532  */
533 void pthreadpool_parallelize_2d_tile_2d_with_uarch(
534 	pthreadpool_t threadpool,
535 	pthreadpool_task_2d_tile_2d_with_id_t function,
536 	void* context,
537 	uint32_t default_uarch_index,
538 	uint32_t max_uarch_index,
539 	size_t range_i,
540 	size_t range_j,
541 	size_t tile_i,
542 	size_t tile_j,
543 	uint32_t flags);
544 
545 /**
546  * Process items on a 3D grid.
547  *
548  * The function implements a parallel version of the following snippet:
549  *
550  *   for (size_t i = 0; i < range_i; i++)
551  *     for (size_t j = 0; j < range_j; j++)
552  *       for (size_t k = 0; k < range_k; k++)
553  *         function(context, i, j, k);
554  *
555  * When the function returns, all items have been processed and the thread pool
556  * is ready for a new task.
557  *
558  * @note If multiple threads call this function with the same thread pool, the
559  *    calls are serialized.
560  *
561  * @param threadpool  the thread pool to use for parallelisation. If threadpool
562  *    is NULL, all items are processed serially on the calling thread.
563  * @param function    the function to call for each item.
564  * @param context     the first argument passed to the specified function.
565  * @param range_i     the number of items to process along the first dimension
566  *    of the 3D grid.
567  * @param range_j     the number of items to process along the second dimension
568  *    of the 3D grid.
569  * @param range_k     the number of items to process along the third dimension
570  *    of the 3D grid.
571  * @param flags       a bitwise combination of zero or more optional flags
572  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
573  */
574 void pthreadpool_parallelize_3d(
575 	pthreadpool_t threadpool,
576 	pthreadpool_task_3d_t function,
577 	void* context,
578 	size_t range_i,
579 	size_t range_j,
580 	size_t range_k,
581 	uint32_t flags);
582 
583 /**
584  * Process items on a 3D grid with the specified maximum tile size along the
585  * last grid dimension.
586  *
587  * The function implements a parallel version of the following snippet:
588  *
589  *   for (size_t i = 0; i < range_i; i++)
590  *     for (size_t j = 0; j < range_j; j++)
591  *       for (size_t k = 0; k < range_k; k += tile_k)
592  *         function(context, i, j, k, min(range_k - k, tile_k));
593  *
594  * When the function returns, all items have been processed and the thread pool
595  * is ready for a new task.
596  *
597  * @note If multiple threads call this function with the same thread pool, the
598  *    calls are serialized.
599  *
600  * @param threadpool  the thread pool to use for parallelisation. If threadpool
601  *    is NULL, all items are processed serially on the calling thread.
602  * @param function    the function to call for each tile.
603  * @param context     the first argument passed to the specified function.
604  * @param range_i     the number of items to process along the first dimension
605  *    of the 3D grid.
606  * @param range_j     the number of items to process along the second dimension
607  *    of the 3D grid.
608  * @param range_k     the number of items to process along the third dimension
609  *    of the 3D grid.
610  * @param tile_k      the maximum number of items along the third dimension of
611  *    the 3D grid to process in one function call.
612  * @param flags       a bitwise combination of zero or more optional flags
613  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
614  */
615 void pthreadpool_parallelize_3d_tile_1d(
616 	pthreadpool_t threadpool,
617 	pthreadpool_task_3d_tile_1d_t function,
618 	void* context,
619 	size_t range_i,
620 	size_t range_j,
621 	size_t range_k,
622 	size_t tile_k,
623 	uint32_t flags);
624 
625 /**
626  * Process items on a 3D grid with the specified maximum tile size along the
627  * last grid dimension and passing along the current thread id.
628  *
629  * The function implements a parallel version of the following snippet:
630  *
631  *   for (size_t i = 0; i < range_i; i++)
632  *     for (size_t j = 0; j < range_j; j++)
633  *       for (size_t k = 0; k < range_k; k += tile_k)
634  *         function(context, thread_index, i, j, k, min(range_k - k, tile_k));
635  *
636  * When the function returns, all items have been processed and the thread pool
637  * is ready for a new task.
638  *
639  * @note If multiple threads call this function with the same thread pool, the
640  *    calls are serialized.
641  *
642  * @param threadpool  the thread pool to use for parallelisation. If threadpool
643  *    is NULL, all items are processed serially on the calling thread.
644  * @param function    the function to call for each tile.
645  * @param context     the first argument passed to the specified function.
646  * @param range_i     the number of items to process along the first dimension
647  *    of the 3D grid.
648  * @param range_j     the number of items to process along the second dimension
649  *    of the 3D grid.
650  * @param range_k     the number of items to process along the third dimension
651  *    of the 3D grid.
652  * @param tile_k      the maximum number of items along the third dimension of
653  *    the 3D grid to process in one function call.
654  * @param flags       a bitwise combination of zero or more optional flags
655  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
656  */
657 void pthreadpool_parallelize_3d_tile_1d_with_thread(
658   pthreadpool_t threadpool,
659   pthreadpool_task_3d_tile_1d_with_thread_t function,
660   void* context,
661   size_t range_i,
662   size_t range_j,
663   size_t range_k,
664   size_t tile_k,
665   uint32_t flags);
666 
667 /**
668  * Process items on a 3D grid with the specified maximum tile size along the
669  * last grid dimension using a microarchitecture-aware task function.
670  *
671  * This routine implements a parallel version of the following snippet:
672  *
673  *   uint32_t uarch_index = cpuinfo_initialize() ?
674  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
675  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
676  *   for (size_t i = 0; i < range_i; i++)
677  *     for (size_t j = 0; j < range_j; j++)
678  *       for (size_t k = 0; k < range_k; k += tile_k)
679  *         function(context, uarch_index, i, j, k, min(range_k - k, tile_k));
680  *
681  * When this routine returns, all items have been processed and the thread pool
682  * is ready for a new task.
683  *
684  * @note If multiple threads call this routine with the same thread pool, the
685  *    calls are serialized.
686  *
687  * @param threadpool           the thread pool to use for parallelisation. If
688  *    threadpool is NULL, all items are processed serially on the calling
689  *    thread.
690  * @param function             the task function to call for each tile.
691  * @param context              the first argument passed to the specified
692  *    task function.
693  * @param default_uarch_index  the microarchitecture index to use when
694  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
695  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
696  *    max_uarch_index value.
697  * @param max_uarch_index      the maximum microarchitecture index expected by
698  *    the specified task function. If the index returned by
699  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
700  *    will be used instead. default_uarch_index can exceed max_uarch_index.
701  * @param range_i              the number of items to process along the first
702  *    dimension of the 3D grid.
703  * @param range_j              the number of items to process along the second
704  *    dimension of the 3D grid.
705  * @param range_k              the number of items to process along the third
706  *    dimension of the 3D grid.
707  * @param tile_k               the maximum number of items along the third
708  *    dimension of the 3D grid to process in one task function call.
709  * @param flags                a bitwise combination of zero or more optional
710  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
711  *    PTHREADPOOL_FLAG_YIELD_WORKERS)
712  */
713 void pthreadpool_parallelize_3d_tile_1d_with_uarch(
714 	pthreadpool_t threadpool,
715 	pthreadpool_task_3d_tile_1d_with_id_t function,
716 	void* context,
717 	uint32_t default_uarch_index,
718 	uint32_t max_uarch_index,
719 	size_t range_i,
720 	size_t range_j,
721 	size_t range_k,
722 	size_t tile_k,
723 	uint32_t flags);
724 
725 /**
726  * Process items on a 3D grid with the specified maximum tile size along the
727  * last grid dimension using a microarchitecture-aware task function and passing
728  * along the current thread id.
729  *
730  * This routine implements a parallel version of the following snippet:
731  *
732  *   uint32_t uarch_index = cpuinfo_initialize() ?
733  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
734  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
735  *   for (size_t i = 0; i < range_i; i++)
736  *     for (size_t j = 0; j < range_j; j++)
737  *       for (size_t k = 0; k < range_k; k += tile_k)
738  *         function(context, uarch_index, thread_index, i, j, k, min(range_k - k, tile_k));
739  *
740  * When this routine returns, all items have been processed and the thread pool
741  * is ready for a new task.
742  *
743  * @note If multiple threads call this routine with the same thread pool, the
744  *    calls are serialized.
745  *
746  * @param threadpool           the thread pool to use for parallelisation. If
747  *    threadpool is NULL, all items are processed serially on the calling
748  *    thread.
749  * @param function             the task function to call for each tile.
750  * @param context              the first argument passed to the specified
751  *    task function.
752  * @param default_uarch_index  the microarchitecture index to use when
753  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
754  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
755  *    max_uarch_index value.
756  * @param max_uarch_index      the maximum microarchitecture index expected by
757  *    the specified task function. If the index returned by
758  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
759  *    will be used instead. default_uarch_index can exceed max_uarch_index.
760  * @param range_i              the number of items to process along the first
761  *    dimension of the 3D grid.
762  * @param range_j              the number of items to process along the second
763  *    dimension of the 3D grid.
764  * @param range_k              the number of items to process along the third
765  *    dimension of the 3D grid.
766  * @param tile_k               the maximum number of items along the third
767  *    dimension of the 3D grid to process in one task function call.
768  * @param flags                a bitwise combination of zero or more optional
769  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
770  *    PTHREADPOOL_FLAG_YIELD_WORKERS)
771  */
772 void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread(
773 	pthreadpool_t threadpool,
774 	pthreadpool_task_3d_tile_1d_with_id_with_thread_t function,
775 	void* context,
776 	uint32_t default_uarch_index,
777 	uint32_t max_uarch_index,
778 	size_t range_i,
779 	size_t range_j,
780 	size_t range_k,
781 	size_t tile_k,
782 	uint32_t flags);
783 
784 /**
785  * Process items on a 3D grid with the specified maximum tile size along the
786  * last two grid dimensions.
787  *
788  * This routine implements a parallel version of the following snippet:
789  *
790  *   for (size_t i = 0; i < range_i; i++)
791  *     for (size_t j = 0; j < range_j; j += tile_j)
792  *       for (size_t k = 0; k < range_k; k += tile_k)
793  *         function(context, i, j, k,
794  *           min(range_j - j, tile_j), min(range_k - k, tile_k));
795  *
796  * When this routine returns, all items have been processed and the thread pool
797  * is ready for a new task.
798  *
799  * @note If multiple threads call this routine with the same thread pool, the
800  *    calls are serialized.
801  *
802  * @param threadpool  the thread pool to use for parallelisation. If threadpool
803  *    is NULL, all items are processed serially on the calling thread.
804  * @param function    the task function to call for each tile.
805  * @param context     the first argument passed to the specified task function.
806  * @param range_i     the number of items to process along the first dimension
807  *    of the 3D grid.
808  * @param range_j     the number of items to process along the second dimension
809  *    of the 3D grid.
810  * @param range_k     the number of items to process along the third dimension
811  *    of the 3D grid.
812  * @param tile_j      the maximum number of items along the second dimension of
813  *    the 3D grid to process in one task function call.
814  * @param tile_k      the maximum number of items along the third dimension of
815  *    the 3D grid to process in one task function call.
816  * @param flags       a bitwise combination of zero or more optional flags
817  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
818  */
819 void pthreadpool_parallelize_3d_tile_2d(
820 	pthreadpool_t threadpool,
821 	pthreadpool_task_3d_tile_2d_t function,
822 	void* context,
823 	size_t range_i,
824 	size_t range_j,
825 	size_t range_k,
826 	size_t tile_j,
827 	size_t tile_k,
828 	uint32_t flags);
829 
830 /**
831  * Process items on a 3D grid with the specified maximum tile size along the
832  * last two grid dimensions using a microarchitecture-aware task function.
833  *
834  * This routine implements a parallel version of the following snippet:
835  *
836  *   uint32_t uarch_index = cpuinfo_initialize() ?
837  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
838  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
839  *   for (size_t i = 0; i < range_i; i++)
840  *     for (size_t j = 0; j < range_j; j += tile_j)
841  *       for (size_t k = 0; k < range_k; k += tile_k)
842  *         function(context, uarch_index, i, j, k,
843  *           min(range_j - j, tile_j), min(range_k - k, tile_k));
844  *
845  * When this routine returns, all items have been processed and the thread pool
846  * is ready for a new task.
847  *
848  * @note If multiple threads call this routine with the same thread pool, the
849  *    calls are serialized.
850  *
851  * @param threadpool           the thread pool to use for parallelisation. If
852  *    threadpool is NULL, all items are processed serially on the calling
853  *    thread.
854  * @param function             the task function to call for each tile.
855  * @param context              the first argument passed to the specified
856  *    task function.
857  * @param default_uarch_index  the microarchitecture index to use when
858  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
859  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
860  *    max_uarch_index value.
861  * @param max_uarch_index      the maximum microarchitecture index expected by
862  *    the specified task function. If the index returned by
863  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
864  *    will be used instead. default_uarch_index can exceed max_uarch_index.
865  * @param range_i              the number of items to process along the first
866  *    dimension of the 3D grid.
867  * @param range_j              the number of items to process along the second
868  *    dimension of the 3D grid.
869  * @param range_k              the number of items to process along the third
870  *    dimension of the 3D grid.
871  * @param tile_j               the maximum number of items along the second
872  *    dimension of the 3D grid to process in one task function call.
873  * @param tile_k               the maximum number of items along the third
874  *    dimension of the 3D grid to process in one task function call.
875  * @param flags                a bitwise combination of zero or more optional
876  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
877  *    PTHREADPOOL_FLAG_YIELD_WORKERS)
878  */
879 void pthreadpool_parallelize_3d_tile_2d_with_uarch(
880 	pthreadpool_t threadpool,
881 	pthreadpool_task_3d_tile_2d_with_id_t function,
882 	void* context,
883 	uint32_t default_uarch_index,
884 	uint32_t max_uarch_index,
885 	size_t range_i,
886 	size_t range_j,
887 	size_t range_k,
888 	size_t tile_j,
889 	size_t tile_k,
890 	uint32_t flags);
891 
892 /**
893  * Process items on a 4D grid.
894  *
895  * This routine implements a parallel version of the following snippet:
896  *
897  *   for (size_t i = 0; i < range_i; i++)
898  *     for (size_t j = 0; j < range_j; j++)
899  *       for (size_t k = 0; k < range_k; k++)
900  *         for (size_t l = 0; l < range_l; l++)
901  *           function(context, i, j, k, l);
902  *
903  * When this routine returns, all items have been processed and the thread pool
904  * is ready for a new task.
905  *
906  * @note If multiple threads call this routine with the same thread pool, the
907  *    calls are serialized.
908  *
909  * @param threadpool  the thread pool to use for parallelisation. If threadpool
910  *    is NULL, all items are processed serially on the calling thread.
911  * @param function    the task function to call for each item.
912  * @param context     the first argument passed to the specified task function.
913  * @param range_i     the number of items to process along the first dimension
914  *    of the 4D grid.
915  * @param range_j     the number of items to process along the second dimension
916  *    of the 4D grid.
917  * @param range_k     the number of items to process along the third dimension
918  *    of the 4D grid.
919  * @param range_l     the number of items to process along the fourth dimension
920  *    of the 4D grid.
921  * @param flags       a bitwise combination of zero or more optional flags
922  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
923  */
924 void pthreadpool_parallelize_4d(
925 	pthreadpool_t threadpool,
926 	pthreadpool_task_4d_t function,
927 	void* context,
928 	size_t range_i,
929 	size_t range_j,
930 	size_t range_k,
931 	size_t range_l,
932 	uint32_t flags);
933 
934 /**
935  * Process items on a 4D grid with the specified maximum tile size along the
936  * last grid dimension.
937  *
938  * This routine implements a parallel version of the following snippet:
939  *
940  *   for (size_t i = 0; i < range_i; i++)
941  *     for (size_t j = 0; j < range_j; j++)
942  *       for (size_t k = 0; k < range_k; k++)
943  *         for (size_t l = 0; l < range_l; l += tile_l)
944  *           function(context, i, j, k, l, min(range_l - l, tile_l));
945  *
946  * When this routine returns, all items have been processed and the thread pool
947  * is ready for a new task.
948  *
949  * @note If multiple threads call this routine with the same thread pool, the
950  *    calls are serialized.
951  *
952  * @param threadpool  the thread pool to use for parallelisation. If threadpool
953  *    is NULL, all items are processed serially on the calling thread.
954  * @param function    the task function to call for each tile.
955  * @param context     the first argument passed to the specified task function.
956  * @param range_i     the number of items to process along the first dimension
957  *    of the 4D grid.
958  * @param range_j     the number of items to process along the second dimension
959  *    of the 4D grid.
960  * @param range_k     the number of items to process along the third dimension
961  *    of the 4D grid.
962  * @param range_l     the number of items to process along the fourth dimension
963  *    of the 4D grid.
964  * @param tile_l      the maximum number of items along the fourth dimension of
965  *    the 4D grid to process in one task function call.
966  * @param flags       a bitwise combination of zero or more optional flags
967  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
968  */
969 void pthreadpool_parallelize_4d_tile_1d(
970 	pthreadpool_t threadpool,
971 	pthreadpool_task_4d_tile_1d_t function,
972 	void* context,
973 	size_t range_i,
974 	size_t range_j,
975 	size_t range_k,
976 	size_t range_l,
977 	size_t tile_l,
978 	uint32_t flags);
979 
980 /**
981  * Process items on a 4D grid with the specified maximum tile size along the
982  * last two grid dimensions.
983  *
984  * This routine implements a parallel version of the following snippet:
985  *
986  *   for (size_t i = 0; i < range_i; i++)
987  *     for (size_t j = 0; j < range_j; j++)
988  *       for (size_t k = 0; k < range_k; k += tile_k)
989  *         for (size_t l = 0; l < range_l; l += tile_l)
990  *           function(context, i, j, k, l,
991  *             min(range_k - k, tile_k), min(range_l - l, tile_l));
992  *
993  * When this routine returns, all items have been processed and the thread pool
994  * is ready for a new task.
995  *
996  * @note If multiple threads call this routine with the same thread pool, the
997  *    calls are serialized.
998  *
999  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1000  *    is NULL, all items are processed serially on the calling thread.
1001  * @param function    the task function to call for each tile.
1002  * @param context     the first argument passed to the specified task function.
1003  * @param range_i     the number of items to process along the first dimension
1004  *    of the 4D grid.
1005  * @param range_j     the number of items to process along the second dimension
1006  *    of the 4D grid.
1007  * @param range_k     the number of items to process along the third dimension
1008  *    of the 4D grid.
1009  * @param range_l     the number of items to process along the fourth dimension
1010  *    of the 4D grid.
1011  * @param tile_k      the maximum number of items along the third dimension of
1012  *    the 4D grid to process in one task function call.
1013  * @param tile_l      the maximum number of items along the fourth dimension of
1014  *    the 4D grid to process in one task function call.
1015  * @param flags       a bitwise combination of zero or more optional flags
1016  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1017  */
1018 void pthreadpool_parallelize_4d_tile_2d(
1019 	pthreadpool_t threadpool,
1020 	pthreadpool_task_4d_tile_2d_t function,
1021 	void* context,
1022 	size_t range_i,
1023 	size_t range_j,
1024 	size_t range_k,
1025 	size_t range_l,
1026 	size_t tile_k,
1027 	size_t tile_l,
1028 	uint32_t flags);
1029 
1030 /**
1031  * Process items on a 4D grid with the specified maximum tile size along the
1032  * last two grid dimensions using a microarchitecture-aware task function.
1033  *
1034  * This routine implements a parallel version of the following snippet:
1035  *
1036  *   uint32_t uarch_index = cpuinfo_initialize() ?
1037  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
1038  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
1039  *   for (size_t i = 0; i < range_i; i++)
1040  *     for (size_t j = 0; j < range_j; j++)
1041  *       for (size_t k = 0; k < range_k; k += tile_k)
1042  *         for (size_t l = 0; l < range_l; l += tile_l)
1043  *           function(context, uarch_index, i, j, k, l,
1044  *             min(range_k - k, tile_k), min(range_l - l, tile_l));
1045  *
1046  * When this routine returns, all items have been processed and the thread pool
1047  * is ready for a new task.
1048  *
1049  * @note If multiple threads call this routine with the same thread pool, the
1050  *    calls are serialized.
1051  *
1052  * @param threadpool           the thread pool to use for parallelisation. If
1053  *    threadpool is NULL, all items are processed serially on the calling
1054  *    thread.
1055  * @param function             the task function to call for each tile.
1056  * @param context              the first argument passed to the specified
1057  *    task function.
1058  * @param default_uarch_index  the microarchitecture index to use when
1059  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
1060  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
1061  *    max_uarch_index value.
1062  * @param max_uarch_index      the maximum microarchitecture index expected by
1063  *    the specified task function. If the index returned by
1064  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
1065  *    will be used instead. default_uarch_index can exceed max_uarch_index.
1066  * @param range_i              the number of items to process along the first
1067  *    dimension of the 4D grid.
1068  * @param range_j              the number of items to process along the second
1069  *    dimension of the 4D grid.
1070  * @param range_k              the number of items to process along the third
1071  *    dimension of the 4D grid.
1072  * @param range_l              the number of items to process along the fourth
1073  *    dimension of the 4D grid.
1074  * @param tile_k               the maximum number of items along the third
1075  *    dimension of the 4D grid to process in one task function call.
1076  * @param tile_l               the maximum number of items along the fourth
1077  *    dimension of the 4D grid to process in one task function call.
1078  * @param flags                a bitwise combination of zero or more optional
1079  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
1080  *    PTHREADPOOL_FLAG_YIELD_WORKERS)
1081  */
1082 void pthreadpool_parallelize_4d_tile_2d_with_uarch(
1083 	pthreadpool_t threadpool,
1084 	pthreadpool_task_4d_tile_2d_with_id_t function,
1085 	void* context,
1086 	uint32_t default_uarch_index,
1087 	uint32_t max_uarch_index,
1088 	size_t range_i,
1089 	size_t range_j,
1090 	size_t range_k,
1091 	size_t range_l,
1092 	size_t tile_k,
1093 	size_t tile_l,
1094 	uint32_t flags);
1095 
1096 /**
1097  * Process items on a 5D grid.
1098  *
1099  * This routine implements a parallel version of the following snippet:
1100  *
1101  *   for (size_t i = 0; i < range_i; i++)
1102  *     for (size_t j = 0; j < range_j; j++)
1103  *       for (size_t k = 0; k < range_k; k++)
1104  *         for (size_t l = 0; l < range_l; l++)
1105  *           for (size_t m = 0; m < range_m; m++)
1106  *             function(context, i, j, k, l, m);
1107  *
1108  * When this routine returns, all items have been processed and the thread pool
1109  * is ready for a new task.
1110  *
1111  * @note If multiple threads call this routine with the same thread pool, the
1112  *    calls are serialized.
1113  *
1114  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1115  *    is NULL, all items are processed serially on the calling thread.
1116  * @param function    the task function to call for each item.
1117  * @param context     the first argument passed to the specified task function.
1118  * @param range_i     the number of items to process along the first dimension
1119  *    of the 5D grid.
1120  * @param range_j     the number of items to process along the second dimension
1121  *    of the 5D grid.
1122  * @param range_k     the number of items to process along the third dimension
1123  *    of the 5D grid.
1124  * @param range_l     the number of items to process along the fourth dimension
1125  *    of the 5D grid.
1126  * @param range_m     the number of items to process along the fifth dimension
1127  *    of the 5D grid.
1128  * @param flags       a bitwise combination of zero or more optional flags
1129  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1130  */
1131 void pthreadpool_parallelize_5d(
1132 	pthreadpool_t threadpool,
1133 	pthreadpool_task_5d_t function,
1134 	void* context,
1135 	size_t range_i,
1136 	size_t range_j,
1137 	size_t range_k,
1138 	size_t range_l,
1139 	size_t range_m,
1140 	uint32_t flags);
1141 
1142 /**
1143  * Process items on a 5D grid with the specified maximum tile size along the
1144  * last grid dimension.
1145  *
1146  * This routine implements a parallel version of the following snippet:
1147  *
1148  *   for (size_t i = 0; i < range_i; i++)
1149  *     for (size_t j = 0; j < range_j; j++)
1150  *       for (size_t k = 0; k < range_k; k++)
1151  *         for (size_t l = 0; l < range_l; l++)
1152  *           for (size_t m = 0; m < range_m; m += tile_m)
1153  *             function(context, i, j, k, l, m, min(range_m - m, tile_m));
1154  *
1155  * When this routine returns, all items have been processed and the thread pool
1156  * is ready for a new task.
1157  *
1158  * @note If multiple threads call this routine with the same thread pool, the
1159  *    calls are serialized.
1160  *
1161  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1162  *    is NULL, all items are processed serially on the calling thread.
1163  * @param function    the task function to call for each tile.
1164  * @param context     the first argument passed to the specified task function.
1165  * @param range_i     the number of items to process along the first dimension
1166  *    of the 5D grid.
1167  * @param range_j     the number of items to process along the second dimension
1168  *    of the 5D grid.
1169  * @param range_k     the number of items to process along the third dimension
1170  *    of the 5D grid.
1171  * @param range_l     the number of items to process along the fourth dimension
1172  *    of the 5D grid.
1173  * @param range_m     the number of items to process along the fifth dimension
1174  *    of the 5D grid.
1175  * @param tile_m      the maximum number of items along the fifth dimension of
1176  *    the 5D grid to process in one task function call.
1177  * @param flags       a bitwise combination of zero or more optional flags
1178  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1179  */
1180 void pthreadpool_parallelize_5d_tile_1d(
1181 	pthreadpool_t threadpool,
1182 	pthreadpool_task_5d_tile_1d_t function,
1183 	void* context,
1184 	size_t range_i,
1185 	size_t range_j,
1186 	size_t range_k,
1187 	size_t range_l,
1188 	size_t range_m,
1189 	size_t tile_m,
1190 	uint32_t flags);
1191 
1192 /**
1193  * Process items on a 5D grid with the specified maximum tile size along the
1194  * last two grid dimensions.
1195  *
1196  * This routine implements a parallel version of the following snippet:
1197  *
1198  *   for (size_t i = 0; i < range_i; i++)
1199  *     for (size_t j = 0; j < range_j; j++)
1200  *       for (size_t k = 0; k < range_k; k++)
1201  *         for (size_t l = 0; l < range_l; l += tile_l)
1202  *           for (size_t m = 0; m < range_m; m += tile_m)
1203  *             function(context, i, j, k, l, m,
1204  *               min(range_l - l, tile_l), min(range_m - m, tile_m));
1205  *
1206  * When this routine returns, all items have been processed and the thread pool
1207  * is ready for a new task.
1208  *
1209  * @note If multiple threads call this routine with the same thread pool, the
1210  *    calls are serialized.
1211  *
1212  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1213  *    is NULL, all items are processed serially on the calling thread.
1214  * @param function    the task function to call for each tile.
1215  * @param context     the first argument passed to the specified task function.
1216  * @param range_i     the number of items to process along the first dimension
1217  *    of the 5D grid.
1218  * @param range_j     the number of items to process along the second dimension
1219  *    of the 5D grid.
1220  * @param range_k     the number of items to process along the third dimension
1221  *    of the 5D grid.
1222  * @param range_l     the number of items to process along the fourth dimension
1223  *    of the 5D grid.
1224  * @param range_m     the number of items to process along the fifth dimension
1225  *    of the 5D grid.
1226  * @param tile_l      the maximum number of items along the fourth dimension of
1227  *    the 5D grid to process in one task function call.
1228  * @param tile_m      the maximum number of items along the fifth dimension of
1229  *    the 5D grid to process in one task function call.
1230  * @param flags       a bitwise combination of zero or more optional flags
1231  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1232  */
1233 void pthreadpool_parallelize_5d_tile_2d(
1234 	pthreadpool_t threadpool,
1235 	pthreadpool_task_5d_tile_2d_t function,
1236 	void* context,
1237 	size_t range_i,
1238 	size_t range_j,
1239 	size_t range_k,
1240 	size_t range_l,
1241 	size_t range_m,
1242 	size_t tile_l,
1243 	size_t tile_m,
1244 	uint32_t flags);
1245 
1246 /**
1247  * Process items on a 6D grid.
1248  *
1249  * This routine implements a parallel version of the following snippet:
1250  *
1251  *   for (size_t i = 0; i < range_i; i++)
1252  *     for (size_t j = 0; j < range_j; j++)
1253  *       for (size_t k = 0; k < range_k; k++)
1254  *         for (size_t l = 0; l < range_l; l++)
1255  *           for (size_t m = 0; m < range_m; m++)
1256  *             for (size_t n = 0; n < range_n; n++)
1257  *               function(context, i, j, k, l, m, n);
1258  *
1259  * When this routine returns, all items have been processed and the thread pool
1260  * is ready for a new task.
1261  *
1262  * @note If multiple threads call this routine with the same thread pool, the
1263  *    calls are serialized.
1264  *
1265  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1266  *    is NULL, all items are processed serially on the calling thread.
1267  * @param function    the task function to call for each item.
1268  * @param context     the first argument passed to the specified task function.
1269  * @param range_i     the number of items to process along the first dimension
1270  *    of the 6D grid.
1271  * @param range_j     the number of items to process along the second dimension
1272  *    of the 6D grid.
1273  * @param range_k     the number of items to process along the third dimension
1274  *    of the 6D grid.
1275  * @param range_l     the number of items to process along the fourth dimension
1276  *    of the 6D grid.
1277  * @param range_m     the number of items to process along the fifth dimension
1278  *    of the 6D grid.
1279  * @param range_n     the number of items to process along the sixth dimension
1280  *    of the 6D grid.
1283  * @param flags       a bitwise combination of zero or more optional flags
1284  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1285  */
1286 void pthreadpool_parallelize_6d(
1287 	pthreadpool_t threadpool,
1288 	pthreadpool_task_6d_t function,
1289 	void* context,
1290 	size_t range_i,
1291 	size_t range_j,
1292 	size_t range_k,
1293 	size_t range_l,
1294 	size_t range_m,
1295 	size_t range_n,
1296 	uint32_t flags);
1297 
1298 /**
1299  * Process items on a 6D grid with the specified maximum tile size along the
1300  * last grid dimension.
1301  *
1302  * This routine implements a parallel version of the following snippet:
1303  *
1304  *   for (size_t i = 0; i < range_i; i++)
1305  *     for (size_t j = 0; j < range_j; j++)
1306  *       for (size_t k = 0; k < range_k; k++)
1307  *         for (size_t l = 0; l < range_l; l++)
1308  *           for (size_t m = 0; m < range_m; m++)
1309  *             for (size_t n = 0; n < range_n; n += tile_n)
1310  *               function(context, i, j, k, l, m, n, min(range_n - n, tile_n));
1311  *
1312  * When this routine returns, all items have been processed and the thread pool
1313  * is ready for a new task.
1314  *
1315  * @note If multiple threads call this routine with the same thread pool, the
1316  *    calls are serialized.
1317  *
1318  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1319  *    is NULL, all items are processed serially on the calling thread.
1320  * @param function    the task function to call for each tile.
1321  * @param context     the first argument passed to the specified task function.
1322  * @param range_i     the number of items to process along the first dimension
1323  *    of the 6D grid.
1324  * @param range_j     the number of items to process along the second dimension
1325  *    of the 6D grid.
1326  * @param range_k     the number of items to process along the third dimension
1327  *    of the 6D grid.
1328  * @param range_l     the number of items to process along the fourth dimension
1329  *    of the 6D grid.
1330  * @param range_m     the number of items to process along the fifth dimension
1331  *    of the 6D grid.
1332  * @param range_n     the number of items to process along the sixth dimension
1333  *    of the 6D grid.
1334  * @param tile_n      the maximum number of items along the sixth dimension of
1335  *    the 6D grid to process in one task function call.
1336  * @param flags       a bitwise combination of zero or more optional flags
1337  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1338  */
1339 void pthreadpool_parallelize_6d_tile_1d(
1340 	pthreadpool_t threadpool,
1341 	pthreadpool_task_6d_tile_1d_t function,
1342 	void* context,
1343 	size_t range_i,
1344 	size_t range_j,
1345 	size_t range_k,
1346 	size_t range_l,
1347 	size_t range_m,
1348 	size_t range_n,
1349 	size_t tile_n,
1350 	uint32_t flags);
1351 
1352 /**
1353  * Process items on a 6D grid with the specified maximum tile size along the
1354  * last two grid dimensions.
1355  *
1356  * The function implements a parallel version of the following snippet:
1357  *
1358  *   for (size_t i = 0; i < range_i; i++)
1359  *     for (size_t j = 0; j < range_j; j++)
1360  *       for (size_t k = 0; k < range_k; k++)
1361  *         for (size_t l = 0; l < range_l; l++)
1362  *           for (size_t m = 0; m < range_m; m += tile_m)
1363  *             for (size_t n = 0; n < range_n; n += tile_n)
1364  *               function(context, i, j, k, l, m, n,
1365  *                 min(range_m - m, tile_m), min(range_n - n, tile_n));
1366  *
1367  * When the function returns, all items have been processed and the thread pool
1368  * is ready for a new task.
1369  *
1370  * @note If multiple threads call this function with the same thread pool, the
1371  *    calls are serialized.
1372  *
1373  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1374  *    is NULL, all items are processed serially on the calling thread.
1375  * @param function    the function to call for each tile.
1376  * @param context     the first argument passed to the specified function.
1377  * @param range_i     the number of items to process along the first dimension
1378  *    of the 6D grid.
1379  * @param range_j     the number of items to process along the second dimension
1380  *    of the 6D grid.
1381  * @param range_k     the number of items to process along the third dimension
1382  *    of the 6D grid.
1383  * @param range_l     the number of items to process along the fourth dimension
1384  *    of the 6D grid.
1385  * @param range_m     the number of items to process along the fifth dimension
1386  *    of the 6D grid.
1387  * @param range_n     the number of items to process along the sixth dimension
1388  *    of the 6D grid.
1389  * @param tile_m      the maximum number of items along the fifth dimension of
1390  *    the 6D grid to process in one function call.
1391  * @param tile_n      the maximum number of items along the sixth dimension of
1392  *    the 6D grid to process in one function call.
1393  * @param flags       a bitwise combination of zero or more optional flags
1394  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1395  */
1396 void pthreadpool_parallelize_6d_tile_2d(
1397 	pthreadpool_t threadpool,
1398 	pthreadpool_task_6d_tile_2d_t function,
1399 	void* context,
1400 	size_t range_i,
1401 	size_t range_j,
1402 	size_t range_k,
1403 	size_t range_l,
1404 	size_t range_m,
1405 	size_t range_n,
1406 	size_t tile_m,
1407 	size_t tile_n,
1408 	uint32_t flags);
1409 
1410 /**
1411  * Terminates threads in the thread pool and releases associated resources.
1412  *
1413  * @warning  Accessing the thread pool after a call to this function constitutes
1414  *    undefined behaviour and may cause data corruption.
1415  *
1416  * @param[in,out]  threadpool  The thread pool to destroy.
1417  */
1418 void pthreadpool_destroy(pthreadpool_t threadpool);
1419 
1420 #ifndef PTHREADPOOL_NO_DEPRECATED_API
1421 
1422 /* Legacy API for compatibility with pre-existing users (e.g. NNPACK) */
1423 #if defined(__GNUC__)
1424 	#define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__))
1425 #else
1426 	#define PTHREADPOOL_DEPRECATED
1427 #endif
1428 
1429 typedef void (*pthreadpool_function_1d_t)(void*, size_t);
1430 typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
1431 typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t);
1432 typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t);
1433 typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
1434 typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
1435 
1436 void pthreadpool_compute_1d(
1437 	pthreadpool_t threadpool,
1438 	pthreadpool_function_1d_t function,
1439 	void* argument,
1440 	size_t range) PTHREADPOOL_DEPRECATED;
1441 
1442 void pthreadpool_compute_1d_tiled(
1443 	pthreadpool_t threadpool,
1444 	pthreadpool_function_1d_tiled_t function,
1445 	void* argument,
1446 	size_t range,
1447 	size_t tile) PTHREADPOOL_DEPRECATED;
1448 
1449 void pthreadpool_compute_2d(
1450 	pthreadpool_t threadpool,
1451 	pthreadpool_function_2d_t function,
1452 	void* argument,
1453 	size_t range_i,
1454 	size_t range_j) PTHREADPOOL_DEPRECATED;
1455 
1456 void pthreadpool_compute_2d_tiled(
1457 	pthreadpool_t threadpool,
1458 	pthreadpool_function_2d_tiled_t function,
1459 	void* argument,
1460 	size_t range_i,
1461 	size_t range_j,
1462 	size_t tile_i,
1463 	size_t tile_j) PTHREADPOOL_DEPRECATED;
1464 
1465 void pthreadpool_compute_3d_tiled(
1466 	pthreadpool_t threadpool,
1467 	pthreadpool_function_3d_tiled_t function,
1468 	void* argument,
1469 	size_t range_i,
1470 	size_t range_j,
1471 	size_t range_k,
1472 	size_t tile_i,
1473 	size_t tile_j,
1474 	size_t tile_k) PTHREADPOOL_DEPRECATED;
1475 
1476 void pthreadpool_compute_4d_tiled(
1477 	pthreadpool_t threadpool,
1478 	pthreadpool_function_4d_tiled_t function,
1479 	void* argument,
1480 	size_t range_i,
1481 	size_t range_j,
1482 	size_t range_k,
1483 	size_t range_l,
1484 	size_t tile_i,
1485 	size_t tile_j,
1486 	size_t tile_k,
1487 	size_t tile_l) PTHREADPOOL_DEPRECATED;
1488 
1489 #endif /* PTHREADPOOL_NO_DEPRECATED_API */
1490 
1491 #ifdef __cplusplus
1492 } /* extern "C" */
1493 #endif
1494 
1495 #ifdef __cplusplus
1496 
namespace libpthreadpool {
namespace detail {
namespace {

/*
 * Trampolines that adapt a C++ callable to the C task callback signatures.
 * Each one receives the callable's address as an opaque void*, views it as a
 * const T, and invokes it with the remaining arguments unchanged. For the
 * tiled variants the trailing arguments are the tile sizes.
 */

/* Recover a const view of the callable from the opaque context pointer. */
template<typename T>
inline const T& functor_from(void* opaque) {
	return *static_cast<const T*>(opaque);
}

/* 1D: functor(i). */
template<typename T>
void call_wrapper_1d(void* functor, size_t i) {
	functor_from<T>(functor)(i);
}

/* 1D, tiled: functor(i, tile_i). */
template<typename T>
void call_wrapper_1d_tile_1d(void* functor, size_t i, size_t tile_i) {
	functor_from<T>(functor)(i, tile_i);
}

/* 2D: functor(i, j). */
template<typename T>
void call_wrapper_2d(void* functor, size_t i, size_t j) {
	functor_from<T>(functor)(i, j);
}

/* 2D, last dimension tiled: functor(i, j, tile_j). */
template<typename T>
void call_wrapper_2d_tile_1d(void* functor, size_t i, size_t j, size_t tile_j) {
	functor_from<T>(functor)(i, j, tile_j);
}

/* 2D, both dimensions tiled: functor(i, j, tile_i, tile_j). */
template<typename T>
void call_wrapper_2d_tile_2d(
	void* functor,
	size_t i, size_t j,
	size_t tile_i, size_t tile_j)
{
	functor_from<T>(functor)(i, j, tile_i, tile_j);
}

/* 3D: functor(i, j, k). */
template<typename T>
void call_wrapper_3d(void* functor, size_t i, size_t j, size_t k) {
	functor_from<T>(functor)(i, j, k);
}

/* 3D, last dimension tiled: functor(i, j, k, tile_k). */
template<typename T>
void call_wrapper_3d_tile_1d(
	void* functor,
	size_t i, size_t j, size_t k,
	size_t tile_k)
{
	functor_from<T>(functor)(i, j, k, tile_k);
}

/* 3D, last two dimensions tiled: functor(i, j, k, tile_j, tile_k). */
template<typename T>
void call_wrapper_3d_tile_2d(
	void* functor,
	size_t i, size_t j, size_t k,
	size_t tile_j, size_t tile_k)
{
	functor_from<T>(functor)(i, j, k, tile_j, tile_k);
}

/* 4D: functor(i, j, k, l). */
template<typename T>
void call_wrapper_4d(void* functor, size_t i, size_t j, size_t k, size_t l) {
	functor_from<T>(functor)(i, j, k, l);
}

/* 4D, last dimension tiled: functor(i, j, k, l, tile_l). */
template<typename T>
void call_wrapper_4d_tile_1d(
	void* functor,
	size_t i, size_t j, size_t k, size_t l,
	size_t tile_l)
{
	functor_from<T>(functor)(i, j, k, l, tile_l);
}

/* 4D, last two dimensions tiled: functor(i, j, k, l, tile_k, tile_l). */
template<typename T>
void call_wrapper_4d_tile_2d(
	void* functor,
	size_t i, size_t j, size_t k, size_t l,
	size_t tile_k, size_t tile_l)
{
	functor_from<T>(functor)(i, j, k, l, tile_k, tile_l);
}

/* 5D: functor(i, j, k, l, m). */
template<typename T>
void call_wrapper_5d(
	void* functor,
	size_t i, size_t j, size_t k, size_t l, size_t m)
{
	functor_from<T>(functor)(i, j, k, l, m);
}

/* 5D, last dimension tiled: functor(i, j, k, l, m, tile_m). */
template<typename T>
void call_wrapper_5d_tile_1d(
	void* functor,
	size_t i, size_t j, size_t k, size_t l, size_t m,
	size_t tile_m)
{
	functor_from<T>(functor)(i, j, k, l, m, tile_m);
}

/* 5D, last two dimensions tiled: functor(i, j, k, l, m, tile_l, tile_m). */
template<typename T>
void call_wrapper_5d_tile_2d(
	void* functor,
	size_t i, size_t j, size_t k, size_t l, size_t m,
	size_t tile_l, size_t tile_m)
{
	functor_from<T>(functor)(i, j, k, l, m, tile_l, tile_m);
}

/* 6D: functor(i, j, k, l, m, n). */
template<typename T>
void call_wrapper_6d(
	void* functor,
	size_t i, size_t j, size_t k, size_t l, size_t m, size_t n)
{
	functor_from<T>(functor)(i, j, k, l, m, n);
}

/* 6D, last dimension tiled: functor(i, j, k, l, m, n, tile_n). */
template<typename T>
void call_wrapper_6d_tile_1d(
	void* functor,
	size_t i, size_t j, size_t k, size_t l, size_t m, size_t n,
	size_t tile_n)
{
	functor_from<T>(functor)(i, j, k, l, m, n, tile_n);
}

/* 6D, last two dimensions tiled: functor(i, j, k, l, m, n, tile_m, tile_n). */
template<typename T>
void call_wrapper_6d_tile_2d(
	void* functor,
	size_t i, size_t j, size_t k, size_t l, size_t m, size_t n,
	size_t tile_m, size_t tile_n)
{
	functor_from<T>(functor)(i, j, k, l, m, n, tile_m, tile_n);
}

}  /* namespace */
}  /* namespace detail */
}  /* namespace libpthreadpool */
1618 
1619 /**
1620  * Process items on a 1D grid.
1621  *
1622  * The function implements a parallel version of the following snippet:
1623  *
1624  *   for (size_t i = 0; i < range; i++)
1625  *     functor(i);
1626  *
1627  * When the function returns, all items have been processed and the thread pool
1628  * is ready for a new task.
1629  *
1630  * @note If multiple threads call this function with the same thread pool, the
1631  *    calls are serialized.
1632  *
1633  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1634  *    is NULL, all items are processed serially on the calling thread.
1635  * @param functor     the functor to call for each item.
1636  * @param range       the number of items on the 1D grid to process. The
1637  *    specified functor will be called once for each item.
1638  * @param flags       a bitwise combination of zero or more optional flags
1639  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1640  */
1641 template<class T>
1642 inline void pthreadpool_parallelize_1d(
1643 	pthreadpool_t threadpool,
1644 	const T& functor,
1645 	size_t range,
1646 	uint32_t flags = 0)
1647 {
1648 	pthreadpool_parallelize_1d(
1649 		threadpool,
1650 		&libpthreadpool::detail::call_wrapper_1d<const T>,
1651 		const_cast<void*>(static_cast<const void*>(&functor)),
1652 		range,
1653 		flags);
1654 }
1655 
1656 /**
1657  * Process items on a 1D grid with specified maximum tile size.
1658  *
1659  * The function implements a parallel version of the following snippet:
1660  *
1661  *   for (size_t i = 0; i < range; i += tile)
1662  *     functor(i, min(range - i, tile));
1663  *
1664  * When the call returns, all items have been processed and the thread pool is
1665  * ready for a new task.
1666  *
1667  * @note If multiple threads call this function with the same thread pool,
1668  *    the calls are serialized.
1669  *
1670  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1671  *    is NULL, all items are processed serially on the calling thread.
1672  * @param functor     the functor to call for each tile.
1673  * @param range       the number of items on the 1D grid to process.
1674  * @param tile        the maximum number of items on the 1D grid to process in
1675  *    one functor call.
1676  * @param flags       a bitwise combination of zero or more optional flags
1677  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1678  */
1679 template<class T>
1680 inline void pthreadpool_parallelize_1d_tile_1d(
1681 	pthreadpool_t threadpool,
1682 	const T& functor,
1683 	size_t range,
1684 	size_t tile,
1685 	uint32_t flags = 0)
1686 {
1687 	pthreadpool_parallelize_1d_tile_1d(
1688 		threadpool,
1689 		&libpthreadpool::detail::call_wrapper_1d_tile_1d<const T>,
1690 		const_cast<void*>(static_cast<const void*>(&functor)),
1691 		range,
1692 		tile,
1693 		flags);
1694 }
1695 
1696 /**
1697  * Process items on a 2D grid.
1698  *
1699  * The function implements a parallel version of the following snippet:
1700  *
1701  *   for (size_t i = 0; i < range_i; i++)
1702  *     for (size_t j = 0; j < range_j; j++)
1703  *       functor(i, j);
1704  *
1705  * When the function returns, all items have been processed and the thread pool
1706  * is ready for a new task.
1707  *
1708  * @note If multiple threads call this function with the same thread pool, the
1709  *    calls are serialized.
1710  *
1711  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1712  *    is NULL, all items are processed serially on the calling thread.
1713  * @param functor     the functor to call for each item.
1714  * @param range_i     the number of items to process along the first dimension
1715  *    of the 2D grid.
1716  * @param range_j     the number of items to process along the second dimension
1717  *    of the 2D grid.
1718  * @param flags       a bitwise combination of zero or more optional flags
1719  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1720  */
1721 template<class T>
1722 inline void pthreadpool_parallelize_2d(
1723 	pthreadpool_t threadpool,
1724 	const T& functor,
1725 	size_t range_i,
1726 	size_t range_j,
1727 	uint32_t flags = 0)
1728 {
1729 	pthreadpool_parallelize_2d(
1730 		threadpool,
1731 		&libpthreadpool::detail::call_wrapper_2d<const T>,
1732 		const_cast<void*>(static_cast<const void*>(&functor)),
1733 		range_i,
1734 		range_j,
1735 		flags);
1736 }
1737 
1738 /**
1739  * Process items on a 2D grid with the specified maximum tile size along the
1740  * last grid dimension.
1741  *
1742  * The function implements a parallel version of the following snippet:
1743  *
1744  *   for (size_t i = 0; i < range_i; i++)
1745  *     for (size_t j = 0; j < range_j; j += tile_j)
1746  *       functor(i, j, min(range_j - j, tile_j));
1747  *
1748  * When the function returns, all items have been processed and the thread pool
1749  * is ready for a new task.
1750  *
1751  * @note If multiple threads call this function with the same thread pool, the
1752  *    calls are serialized.
1753  *
1754  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1755  *    is NULL, all items are processed serially on the calling thread.
1756  * @param functor     the functor to call for each tile.
1757  * @param range_i     the number of items to process along the first dimension
1758  *    of the 2D grid.
1759  * @param range_j     the number of items to process along the second dimension
1760  *    of the 2D grid.
1761  * @param tile_j      the maximum number of items along the second dimension of
1762  *    the 2D grid to process in one functor call.
1763  * @param flags       a bitwise combination of zero or more optional flags
1764  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1765  */
1766 template<class T>
1767 inline void pthreadpool_parallelize_2d_tile_1d(
1768 	pthreadpool_t threadpool,
1769 	const T& functor,
1770 	size_t range_i,
1771 	size_t range_j,
1772 	size_t tile_j,
1773 	uint32_t flags = 0)
1774 {
1775 	pthreadpool_parallelize_2d_tile_1d(
1776 		threadpool,
1777 		&libpthreadpool::detail::call_wrapper_2d_tile_1d<const T>,
1778 		const_cast<void*>(static_cast<const void*>(&functor)),
1779 		range_i,
1780 		range_j,
1781 		tile_j,
1782 		flags);
1783 }
1784 
1785 /**
1786  * Process items on a 2D grid with the specified maximum tile size along each
1787  * grid dimension.
1788  *
1789  * The function implements a parallel version of the following snippet:
1790  *
1791  *   for (size_t i = 0; i < range_i; i += tile_i)
1792  *     for (size_t j = 0; j < range_j; j += tile_j)
1793  *       functor(i, j,
1794  *         min(range_i - i, tile_i), min(range_j - j, tile_j));
1795  *
1796  * When the function returns, all items have been processed and the thread pool
1797  * is ready for a new task.
1798  *
1799  * @note If multiple threads call this function with the same thread pool, the
1800  *    calls are serialized.
1801  *
1802  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1803  *    is NULL, all items are processed serially on the calling thread.
1804  * @param functor     the functor to call for each tile.
1805  * @param range_i     the number of items to process along the first dimension
1806  *    of the 2D grid.
1807  * @param range_j     the number of items to process along the second dimension
1808  *    of the 2D grid.
1809  * @param tile_j      the maximum number of items along the first dimension of
1810  *    the 2D grid to process in one functor call.
1811  * @param tile_j      the maximum number of items along the second dimension of
1812  *    the 2D grid to process in one functor call.
1813  * @param flags       a bitwise combination of zero or more optional flags
1814  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1815  */
1816 template<class T>
1817 inline void pthreadpool_parallelize_2d_tile_2d(
1818 	pthreadpool_t threadpool,
1819 	const T& functor,
1820 	size_t range_i,
1821 	size_t range_j,
1822 	size_t tile_i,
1823 	size_t tile_j,
1824 	uint32_t flags = 0)
1825 {
1826 	pthreadpool_parallelize_2d_tile_2d(
1827 		threadpool,
1828 		&libpthreadpool::detail::call_wrapper_2d_tile_2d<const T>,
1829 		const_cast<void*>(static_cast<const void*>(&functor)),
1830 		range_i,
1831 		range_j,
1832 		tile_i,
1833 		tile_j,
1834 		flags);
1835 }
1836 
1837 /**
1838  * Process items on a 3D grid.
1839  *
1840  * The function implements a parallel version of the following snippet:
1841  *
1842  *   for (size_t i = 0; i < range_i; i++)
1843  *     for (size_t j = 0; j < range_j; j++)
1844  *       for (size_t k = 0; k < range_k; k++)
1845  *         functor(i, j, k);
1846  *
1847  * When the function returns, all items have been processed and the thread pool
1848  * is ready for a new task.
1849  *
1850  * @note If multiple threads call this function with the same thread pool, the
1851  *    calls are serialized.
1852  *
1853  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1854  *    is NULL, all items are processed serially on the calling thread.
1855  * @param functor     the functor to call for each tile.
1856  * @param range_i     the number of items to process along the first dimension
1857  *    of the 3D grid.
1858  * @param range_j     the number of items to process along the second dimension
1859  *    of the 3D grid.
1860  * @param range_k     the number of items to process along the third dimension
1861  *    of the 3D grid.
1862  * @param flags       a bitwise combination of zero or more optional flags
1863  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1864  */
1865 template<class T>
1866 inline void pthreadpool_parallelize_3d(
1867 	pthreadpool_t threadpool,
1868 	const T& functor,
1869 	size_t range_i,
1870 	size_t range_j,
1871 	size_t range_k,
1872 	uint32_t flags = 0)
1873 {
1874 	pthreadpool_parallelize_3d(
1875 		threadpool,
1876 		&libpthreadpool::detail::call_wrapper_3d<const T>,
1877 		const_cast<void*>(static_cast<const void*>(&functor)),
1878 		range_i,
1879 		range_j,
1880 		range_k,
1881 		flags);
1882 }
1883 
1884 /**
1885  * Process items on a 3D grid with the specified maximum tile size along the
1886  * last grid dimension.
1887  *
1888  * The function implements a parallel version of the following snippet:
1889  *
1890  *   for (size_t i = 0; i < range_i; i++)
1891  *     for (size_t j = 0; j < range_j; j++)
1892  *       for (size_t k = 0; k < range_k; k += tile_k)
1893  *         functor(i, j, k, min(range_k - k, tile_k));
1894  *
1895  * When the function returns, all items have been processed and the thread pool
1896  * is ready for a new task.
1897  *
1898  * @note If multiple threads call this function with the same thread pool, the
1899  *    calls are serialized.
1900  *
1901  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1902  *    is NULL, all items are processed serially on the calling thread.
1903  * @param functor     the functor to call for each tile.
1904  * @param range_i     the number of items to process along the first dimension
1905  *    of the 3D grid.
1906  * @param range_j     the number of items to process along the second dimension
1907  *    of the 3D grid.
1908  * @param range_k     the number of items to process along the third dimension
1909  *    of the 3D grid.
1910  * @param tile_k      the maximum number of items along the third dimension of
1911  *    the 3D grid to process in one functor call.
1912  * @param flags       a bitwise combination of zero or more optional flags
1913  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1914  */
1915 template<class T>
1916 inline void pthreadpool_parallelize_3d_tile_1d(
1917 	pthreadpool_t threadpool,
1918 	const T& functor,
1919 	size_t range_i,
1920 	size_t range_j,
1921 	size_t range_k,
1922 	size_t tile_k,
1923 	uint32_t flags = 0)
1924 {
1925 	pthreadpool_parallelize_3d_tile_1d(
1926 		threadpool,
1927 		&libpthreadpool::detail::call_wrapper_3d_tile_1d<const T>,
1928 		const_cast<void*>(static_cast<const void*>(&functor)),
1929 		range_i,
1930 		range_j,
1931 		range_k,
1932 		tile_k,
1933 		flags);
1934 }
1935 
1936 /**
1937  * Process items on a 3D grid with the specified maximum tile size along the
1938  * last two grid dimensions.
1939  *
1940  * The function implements a parallel version of the following snippet:
1941  *
1942  *   for (size_t i = 0; i < range_i; i++)
1943  *     for (size_t j = 0; j < range_j; j += tile_j)
1944  *       for (size_t k = 0; k < range_k; k += tile_k)
1945  *         functor(i, j, k,
1946  *           min(range_j - j, tile_j), min(range_k - k, tile_k));
1947  *
1948  * When the function returns, all items have been processed and the thread pool
1949  * is ready for a new task.
1950  *
1951  * @note If multiple threads call this function with the same thread pool, the
1952  *    calls are serialized.
1953  *
1954  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1955  *    is NULL, all items are processed serially on the calling thread.
1956  * @param functor     the functor to call for each tile.
1957  * @param range_i     the number of items to process along the first dimension
1958  *    of the 3D grid.
1959  * @param range_j     the number of items to process along the second dimension
1960  *    of the 3D grid.
1961  * @param range_k     the number of items to process along the third dimension
1962  *    of the 3D grid.
1963  * @param tile_j      the maximum number of items along the second dimension of
1964  *    the 3D grid to process in one functor call.
1965  * @param tile_k      the maximum number of items along the third dimension of
1966  *    the 3D grid to process in one functor call.
1967  * @param flags       a bitwise combination of zero or more optional flags
1968  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1969  */
1970 template<class T>
1971 inline void pthreadpool_parallelize_3d_tile_2d(
1972 	pthreadpool_t threadpool,
1973 	const T& functor,
1974 	size_t range_i,
1975 	size_t range_j,
1976 	size_t range_k,
1977 	size_t tile_j,
1978 	size_t tile_k,
1979 	uint32_t flags = 0)
1980 {
1981 	pthreadpool_parallelize_3d_tile_2d(
1982 		threadpool,
1983 		&libpthreadpool::detail::call_wrapper_3d_tile_2d<const T>,
1984 		const_cast<void*>(static_cast<const void*>(&functor)),
1985 		range_i,
1986 		range_j,
1987 		range_k,
1988 		tile_j,
1989 		tile_k,
1990 		flags);
1991 }
1992 
1993 /**
1994  * Process items on a 4D grid.
1995  *
1996  * The function implements a parallel version of the following snippet:
1997  *
1998  *   for (size_t i = 0; i < range_i; i++)
1999  *     for (size_t j = 0; j < range_j; j++)
2000  *       for (size_t k = 0; k < range_k; k++)
2001  *         for (size_t l = 0; l < range_l; l++)
2002  *           functor(i, j, k, l);
2003  *
2004  * When the function returns, all items have been processed and the thread pool
2005  * is ready for a new task.
2006  *
2007  * @note If multiple threads call this function with the same thread pool, the
2008  *    calls are serialized.
2009  *
2010  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2011  *    is NULL, all items are processed serially on the calling thread.
2012  * @param functor     the functor to call for each tile.
2013  * @param range_i     the number of items to process along the first dimension
2014  *    of the 4D grid.
2015  * @param range_j     the number of items to process along the second dimension
2016  *    of the 4D grid.
2017  * @param range_k     the number of items to process along the third dimension
2018  *    of the 4D grid.
2019  * @param range_l     the number of items to process along the fourth dimension
2020  *    of the 4D grid.
2021  * @param flags       a bitwise combination of zero or more optional flags
2022  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2023  */
2024 template<class T>
2025 inline void pthreadpool_parallelize_4d(
2026 	pthreadpool_t threadpool,
2027 	const T& functor,
2028 	size_t range_i,
2029 	size_t range_j,
2030 	size_t range_k,
2031 	size_t range_l,
2032 	uint32_t flags = 0)
2033 {
2034 	pthreadpool_parallelize_4d(
2035 		threadpool,
2036 		&libpthreadpool::detail::call_wrapper_4d<const T>,
2037 		const_cast<void*>(static_cast<const void*>(&functor)),
2038 		range_i,
2039 		range_j,
2040 		range_k,
2041 		range_l,
2042 		flags);
2043 }
2044 
2045 /**
2046  * Process items on a 4D grid with the specified maximum tile size along the
2047  * last grid dimension.
2048  *
2049  * The function implements a parallel version of the following snippet:
2050  *
2051  *   for (size_t i = 0; i < range_i; i++)
2052  *     for (size_t j = 0; j < range_j; j++)
2053  *       for (size_t k = 0; k < range_k; k++)
2054  *         for (size_t l = 0; l < range_l; l += tile_l)
2055  *           functor(i, j, k, l, min(range_l - l, tile_l));
2056  *
2057  * When the function returns, all items have been processed and the thread pool
2058  * is ready for a new task.
2059  *
2060  * @note If multiple threads call this function with the same thread pool, the
2061  *    calls are serialized.
2062  *
2063  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2064  *    is NULL, all items are processed serially on the calling thread.
2065  * @param functor     the functor to call for each tile.
2066  * @param range_i     the number of items to process along the first dimension
2067  *    of the 4D grid.
2068  * @param range_j     the number of items to process along the second dimension
2069  *    of the 4D grid.
2070  * @param range_k     the number of items to process along the third dimension
2071  *    of the 4D grid.
2072  * @param range_l     the number of items to process along the fourth dimension
2073  *    of the 4D grid.
2074  * @param tile_l      the maximum number of items along the fourth dimension of
2075  *    the 4D grid to process in one functor call.
2076  * @param flags       a bitwise combination of zero or more optional flags
2077  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2078  */
2079 template<class T>
2080 inline void pthreadpool_parallelize_4d_tile_1d(
2081 	pthreadpool_t threadpool,
2082 	const T& functor,
2083 	size_t range_i,
2084 	size_t range_j,
2085 	size_t range_k,
2086 	size_t range_l,
2087 	size_t tile_l,
2088 	uint32_t flags = 0)
2089 {
2090 	pthreadpool_parallelize_4d_tile_1d(
2091 		threadpool,
2092 		&libpthreadpool::detail::call_wrapper_4d_tile_1d<const T>,
2093 		const_cast<void*>(static_cast<const void*>(&functor)),
2094 		range_i,
2095 		range_j,
2096 		range_k,
2097 		range_l,
2098 		tile_l,
2099 		flags);
2100 }
2101 
2102 /**
2103  * Process items on a 4D grid with the specified maximum tile size along the
2104  * last two grid dimensions.
2105  *
2106  * The function implements a parallel version of the following snippet:
2107  *
2108  *   for (size_t i = 0; i < range_i; i++)
2109  *     for (size_t j = 0; j < range_j; j++)
2110  *       for (size_t k = 0; k < range_k; k += tile_k)
2111  *         for (size_t l = 0; l < range_l; l += tile_l)
2112  *           functor(i, j, k, l,
2113  *             min(range_k - k, tile_k), min(range_l - l, tile_l));
2114  *
2115  * When the function returns, all items have been processed and the thread pool
2116  * is ready for a new task.
2117  *
2118  * @note If multiple threads call this function with the same thread pool, the
2119  *    calls are serialized.
2120  *
2121  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2122  *    is NULL, all items are processed serially on the calling thread.
2123  * @param functor     the functor to call for each tile.
2124  * @param range_i     the number of items to process along the first dimension
2125  *    of the 4D grid.
2126  * @param range_j     the number of items to process along the second dimension
2127  *    of the 4D grid.
2128  * @param range_k     the number of items to process along the third dimension
2129  *    of the 4D grid.
2130  * @param range_l     the number of items to process along the fourth dimension
2131  *    of the 4D grid.
2132  * @param tile_k      the maximum number of items along the third dimension of
2133  *    the 4D grid to process in one functor call.
2134  * @param tile_l      the maximum number of items along the fourth dimension of
2135  *    the 4D grid to process in one functor call.
2136  * @param flags       a bitwise combination of zero or more optional flags
2137  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2138  */
2139 template<class T>
2140 inline void pthreadpool_parallelize_4d_tile_2d(
2141 	pthreadpool_t threadpool,
2142 	const T& functor,
2143 	size_t range_i,
2144 	size_t range_j,
2145 	size_t range_k,
2146 	size_t range_l,
2147 	size_t tile_k,
2148 	size_t tile_l,
2149 	uint32_t flags = 0)
2150 {
2151 	pthreadpool_parallelize_4d_tile_2d(
2152 		threadpool,
2153 		&libpthreadpool::detail::call_wrapper_4d_tile_2d<const T>,
2154 		const_cast<void*>(static_cast<const void*>(&functor)),
2155 		range_i,
2156 		range_j,
2157 		range_k,
2158 		range_l,
2159 		tile_k,
2160 		tile_l,
2161 		flags);
2162 }
2163 
2164 /**
2165  * Process items on a 5D grid.
2166  *
2167  * The function implements a parallel version of the following snippet:
2168  *
2169  *   for (size_t i = 0; i < range_i; i++)
2170  *     for (size_t j = 0; j < range_j; j++)
2171  *       for (size_t k = 0; k < range_k; k++)
2172  *         for (size_t l = 0; l < range_l; l++)
2173  *           for (size_t m = 0; m < range_m; m++)
2174  *             functor(i, j, k, l, m);
2175  *
2176  * When the function returns, all items have been processed and the thread pool
2177  * is ready for a new task.
2178  *
2179  * @note If multiple threads call this function with the same thread pool, the
2180  *    calls are serialized.
2181  *
2182  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2183  *    is NULL, all items are processed serially on the calling thread.
2184  * @param functor     the functor to call for each tile.
2185  * @param range_i     the number of items to process along the first dimension
2186  *    of the 5D grid.
2187  * @param range_j     the number of items to process along the second dimension
2188  *    of the 5D grid.
2189  * @param range_k     the number of items to process along the third dimension
2190  *    of the 5D grid.
2191  * @param range_l     the number of items to process along the fourth dimension
2192  *    of the 5D grid.
2193  * @param range_m     the number of items to process along the fifth dimension
2194  *    of the 5D grid.
2195  * @param flags       a bitwise combination of zero or more optional flags
2196  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2197  */
2198 template<class T>
2199 inline void pthreadpool_parallelize_5d(
2200 	pthreadpool_t threadpool,
2201 	const T& functor,
2202 	size_t range_i,
2203 	size_t range_j,
2204 	size_t range_k,
2205 	size_t range_l,
2206 	size_t range_m,
2207 	uint32_t flags = 0)
2208 {
2209 	pthreadpool_parallelize_5d(
2210 		threadpool,
2211 		&libpthreadpool::detail::call_wrapper_5d<const T>,
2212 		const_cast<void*>(static_cast<const void*>(&functor)),
2213 		range_i,
2214 		range_j,
2215 		range_k,
2216 		range_l,
2217 		range_m,
2218 		flags);
2219 }
2220 
2221 /**
2222  * Process items on a 5D grid with the specified maximum tile size along the
2223  * last grid dimension.
2224  *
2225  * The function implements a parallel version of the following snippet:
2226  *
2227  *   for (size_t i = 0; i < range_i; i++)
2228  *     for (size_t j = 0; j < range_j; j++)
2229  *       for (size_t k = 0; k < range_k; k++)
2230  *         for (size_t l = 0; l < range_l; l++)
2231  *           for (size_t m = 0; m < range_m; m += tile_m)
2232  *             functor(i, j, k, l, m, min(range_m - m, tile_m));
2233  *
2234  * When the function returns, all items have been processed and the thread pool
2235  * is ready for a new task.
2236  *
2237  * @note If multiple threads call this function with the same thread pool, the
2238  *    calls are serialized.
2239  *
2240  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2241  *    is NULL, all items are processed serially on the calling thread.
2242  * @param functor     the functor to call for each tile.
2243  * @param range_i     the number of items to process along the first dimension
2244  *    of the 5D grid.
2245  * @param range_j     the number of items to process along the second dimension
2246  *    of the 5D grid.
2247  * @param range_k     the number of items to process along the third dimension
2248  *    of the 5D grid.
2249  * @param range_l     the number of items to process along the fourth dimension
2250  *    of the 5D grid.
2251  * @param range_m     the number of items to process along the fifth dimension
2252  *    of the 5D grid.
2253  * @param tile_m      the maximum number of items along the fifth dimension of
2254  *    the 5D grid to process in one functor call.
2255  * @param flags       a bitwise combination of zero or more optional flags
2256  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2257  */
2258 template<class T>
2259 inline void pthreadpool_parallelize_5d_tile_1d(
2260 	pthreadpool_t threadpool,
2261 	const T& functor,
2262 	size_t range_i,
2263 	size_t range_j,
2264 	size_t range_k,
2265 	size_t range_l,
2266 	size_t range_m,
2267 	size_t tile_m,
2268 	uint32_t flags = 0)
2269 {
2270 	pthreadpool_parallelize_5d_tile_1d(
2271 		threadpool,
2272 		&libpthreadpool::detail::call_wrapper_5d_tile_1d<const T>,
2273 		const_cast<void*>(static_cast<const void*>(&functor)),
2274 		range_i,
2275 		range_j,
2276 		range_k,
2277 		range_l,
2278 		range_m,
2279 		tile_m,
2280 		flags);
2281 }
2282 
2283 /**
2284  * Process items on a 5D grid with the specified maximum tile size along the
2285  * last two grid dimensions.
2286  *
2287  * The function implements a parallel version of the following snippet:
2288  *
2289  *   for (size_t i = 0; i < range_i; i++)
2290  *     for (size_t j = 0; j < range_j; j++)
2291  *       for (size_t k = 0; k < range_k; k++)
2292  *         for (size_t l = 0; l < range_l; l += tile_l)
2293  *           for (size_t m = 0; m < range_m; m += tile_m)
2294  *             functor(i, j, k, l, m,
2295  *               min(range_l - l, tile_l), min(range_m - m, tile_m));
2296  *
2297  * When the function returns, all items have been processed and the thread pool
2298  * is ready for a new task.
2299  *
2300  * @note If multiple threads call this function with the same thread pool, the
2301  *    calls are serialized.
2302  *
2303  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2304  *    is NULL, all items are processed serially on the calling thread.
2305  * @param functor     the functor to call for each tile.
2306  * @param range_i     the number of items to process along the first dimension
2307  *    of the 5D grid.
2308  * @param range_j     the number of items to process along the second dimension
2309  *    of the 5D grid.
2310  * @param range_k     the number of items to process along the third dimension
2311  *    of the 5D grid.
2312  * @param range_l     the number of items to process along the fourth dimension
2313  *    of the 5D grid.
2314  * @param range_m     the number of items to process along the fifth dimension
2315  *    of the 5D grid.
2316  * @param tile_l      the maximum number of items along the fourth dimension of
2317  *    the 5D grid to process in one functor call.
2318  * @param tile_m      the maximum number of items along the fifth dimension of
2319  *    the 5D grid to process in one functor call.
2320  * @param flags       a bitwise combination of zero or more optional flags
2321  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2322  */
2323 template<class T>
2324 inline void pthreadpool_parallelize_5d_tile_2d(
2325 	pthreadpool_t threadpool,
2326 	const T& functor,
2327 	size_t range_i,
2328 	size_t range_j,
2329 	size_t range_k,
2330 	size_t range_l,
2331 	size_t range_m,
2332 	size_t tile_l,
2333 	size_t tile_m,
2334 	uint32_t flags = 0)
2335 {
2336 	pthreadpool_parallelize_5d_tile_2d(
2337 		threadpool,
2338 		&libpthreadpool::detail::call_wrapper_5d_tile_2d<const T>,
2339 		const_cast<void*>(static_cast<const void*>(&functor)),
2340 		range_i,
2341 		range_j,
2342 		range_k,
2343 		range_l,
2344 		range_m,
2345 		tile_l,
2346 		tile_m,
2347 		flags);
2348 }
2349 
2350 /**
2351  * Process items on a 6D grid.
2352  *
2353  * The function implements a parallel version of the following snippet:
2354  *
2355  *   for (size_t i = 0; i < range_i; i++)
2356  *     for (size_t j = 0; j < range_j; j++)
2357  *       for (size_t k = 0; k < range_k; k++)
2358  *         for (size_t l = 0; l < range_l; l++)
2359  *           for (size_t m = 0; m < range_m; m++)
2360  *             for (size_t n = 0; n < range_n; n++)
2361  *               functor(i, j, k, l, m, n);
2362  *
2363  * When the function returns, all items have been processed and the thread pool
2364  * is ready for a new task.
2365  *
2366  * @note If multiple threads call this function with the same thread pool, the
2367  *    calls are serialized.
2368  *
2369  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2370  *    is NULL, all items are processed serially on the calling thread.
2371  * @param functor     the functor to call for each tile.
2372  * @param range_i     the number of items to process along the first dimension
2373  *    of the 6D grid.
2374  * @param range_j     the number of items to process along the second dimension
2375  *    of the 6D grid.
2376  * @param range_k     the number of items to process along the third dimension
2377  *    of the 6D grid.
2378  * @param range_l     the number of items to process along the fourth dimension
2379  *    of the 6D grid.
2380  * @param range_m     the number of items to process along the fifth dimension
2381  *    of the 6D grid.
2382  * @param range_n     the number of items to process along the sixth dimension
2383  *    of the 6D grid.
2384  * @param tile_n      the maximum number of items along the sixth dimension of
2385  *    the 6D grid to process in one functor call.
2386  * @param flags       a bitwise combination of zero or more optional flags
2387  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2388  */
2389 template<class T>
2390 inline void pthreadpool_parallelize_6d(
2391 	pthreadpool_t threadpool,
2392 	const T& functor,
2393 	size_t range_i,
2394 	size_t range_j,
2395 	size_t range_k,
2396 	size_t range_l,
2397 	size_t range_m,
2398 	size_t range_n,
2399 	uint32_t flags = 0)
2400 {
2401 	pthreadpool_parallelize_6d(
2402 		threadpool,
2403 		&libpthreadpool::detail::call_wrapper_6d<const T>,
2404 		const_cast<void*>(static_cast<const void*>(&functor)),
2405 		range_i,
2406 		range_j,
2407 		range_k,
2408 		range_l,
2409 		range_m,
2410 		range_n,
2411 		flags);
2412 }
2413 
2414 /**
2415  * Process items on a 6D grid with the specified maximum tile size along the
2416  * last grid dimension.
2417  *
2418  * The function implements a parallel version of the following snippet:
2419  *
2420  *   for (size_t i = 0; i < range_i; i++)
2421  *     for (size_t j = 0; j < range_j; j++)
2422  *       for (size_t k = 0; k < range_k; k++)
2423  *         for (size_t l = 0; l < range_l; l++)
2424  *           for (size_t m = 0; m < range_m; m++)
2425  *             for (size_t n = 0; n < range_n; n += tile_n)
2426  *               functor(i, j, k, l, m, n, min(range_n - n, tile_n));
2427  *
2428  * When the function returns, all items have been processed and the thread pool
2429  * is ready for a new task.
2430  *
2431  * @note If multiple threads call this function with the same thread pool, the
2432  *    calls are serialized.
2433  *
2434  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2435  *    is NULL, all items are processed serially on the calling thread.
2436  * @param functor     the functor to call for each tile.
2437  * @param range_i     the number of items to process along the first dimension
2438  *    of the 6D grid.
2439  * @param range_j     the number of items to process along the second dimension
2440  *    of the 6D grid.
2441  * @param range_k     the number of items to process along the third dimension
2442  *    of the 6D grid.
2443  * @param range_l     the number of items to process along the fourth dimension
2444  *    of the 6D grid.
2445  * @param range_m     the number of items to process along the fifth dimension
2446  *    of the 6D grid.
2447  * @param range_n     the number of items to process along the sixth dimension
2448  *    of the 6D grid.
2449  * @param tile_n      the maximum number of items along the sixth dimension of
2450  *    the 6D grid to process in one functor call.
2451  * @param flags       a bitwise combination of zero or more optional flags
2452  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2453  */
2454 template<class T>
2455 inline void pthreadpool_parallelize_6d_tile_1d(
2456 	pthreadpool_t threadpool,
2457 	const T& functor,
2458 	size_t range_i,
2459 	size_t range_j,
2460 	size_t range_k,
2461 	size_t range_l,
2462 	size_t range_m,
2463 	size_t range_n,
2464 	size_t tile_n,
2465 	uint32_t flags = 0)
2466 {
2467 	pthreadpool_parallelize_6d_tile_1d(
2468 		threadpool,
2469 		&libpthreadpool::detail::call_wrapper_6d_tile_1d<const T>,
2470 		const_cast<void*>(static_cast<const void*>(&functor)),
2471 		range_i,
2472 		range_j,
2473 		range_k,
2474 		range_l,
2475 		range_m,
2476 		range_n,
2477 		tile_n,
2478 		flags);
2479 }
2480 
2481 /**
2482  * Process items on a 6D grid with the specified maximum tile size along the
2483  * last two grid dimensions.
2484  *
2485  * The function implements a parallel version of the following snippet:
2486  *
2487  *   for (size_t i = 0; i < range_i; i++)
2488  *     for (size_t j = 0; j < range_j; j++)
2489  *       for (size_t k = 0; k < range_k; k++)
2490  *         for (size_t l = 0; l < range_l; l++)
2491  *           for (size_t m = 0; m < range_m; m += tile_m)
2492  *             for (size_t n = 0; n < range_n; n += tile_n)
2493  *               functor(i, j, k, l, m, n,
2494  *                 min(range_m - m, tile_m), min(range_n - n, tile_n));
2495  *
2496  * When the function returns, all items have been processed and the thread pool
2497  * is ready for a new task.
2498  *
2499  * @note If multiple threads call this function with the same thread pool, the
2500  *    calls are serialized.
2501  *
2502  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2503  *    is NULL, all items are processed serially on the calling thread.
2504  * @param functor     the functor to call for each tile.
2505  * @param range_i     the number of items to process along the first dimension
2506  *    of the 6D grid.
2507  * @param range_j     the number of items to process along the second dimension
2508  *    of the 6D grid.
2509  * @param range_k     the number of items to process along the third dimension
2510  *    of the 6D grid.
2511  * @param range_l     the number of items to process along the fourth dimension
2512  *    of the 6D grid.
2513  * @param range_m     the number of items to process along the fifth dimension
2514  *    of the 6D grid.
2515  * @param range_n     the number of items to process along the sixth dimension
2516  *    of the 6D grid.
2517  * @param tile_m      the maximum number of items along the fifth dimension of
2518  *    the 6D grid to process in one functor call.
2519  * @param tile_n      the maximum number of items along the sixth dimension of
2520  *    the 6D grid to process in one functor call.
2521  * @param flags       a bitwise combination of zero or more optional flags
2522  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2523  */
2524 template<class T>
2525 inline void pthreadpool_parallelize_6d_tile_2d(
2526 	pthreadpool_t threadpool,
2527 	const T& functor,
2528 	size_t range_i,
2529 	size_t range_j,
2530 	size_t range_k,
2531 	size_t range_l,
2532 	size_t range_m,
2533 	size_t range_n,
2534 	size_t tile_m,
2535 	size_t tile_n,
2536 	uint32_t flags = 0)
2537 {
2538 	pthreadpool_parallelize_6d_tile_2d(
2539 		threadpool,
2540 		&libpthreadpool::detail::call_wrapper_6d_tile_2d<const T>,
2541 		const_cast<void*>(static_cast<const void*>(&functor)),
2542 		range_i,
2543 		range_j,
2544 		range_k,
2545 		range_l,
2546 		range_m,
2547 		range_n,
2548 		tile_m,
2549 		tile_n,
2550 		flags);
2551 }
2552 
2553 #endif  /* __cplusplus */
2554 
2555 #endif /* PTHREADPOOL_H_ */
2556