1 #ifndef PTHREADPOOL_H_ 2 #define PTHREADPOOL_H_ 3 4 #include <stddef.h> 5 #include <stdint.h> 6 /** Opaque handle to a thread pool object, as returned by pthreadpool_create(). */ 7 typedef struct pthreadpool* pthreadpool_t; 8 /** Task callback types accepted by the pthreadpool_parallelize_* functions. The void* argument receives the context pointer given to the parallelize call. Per the loop snippets documented on those functions, the size_t arguments are the grid indices (one per dimension) followed, for the tile_1d/tile_2d variants, by the extent of the tile along each tiled dimension. NOTE(review): the functions taking the 5d_tile_2d and 6d* callback types are not declared in this part of the header; they presumably follow the same convention - confirm. */ 9 typedef void (*pthreadpool_task_1d_t)(void*, size_t); 10 typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t); 11 typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t); 12 typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t); 13 typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t); 14 typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t); 15 typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t); 16 typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t); 17 typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t); 18 typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t); 19 typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); 20 typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t); 21 typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); 22 typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t); 23 typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); 24 typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t); 25 typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t); 26 /** Microarchitecture-aware task callback types used by the *_with_uarch functions: as above, with a uint32_t microarchitecture index inserted after the context pointer. */ 27 typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t); 28 typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); 29 typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t); 30 typedef 
void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t); 31 32 33 /** 34 * Disable support for denormalized numbers to the maximum extent possible for 35 * the duration of the computation. 36 * 37 * Handling denormalized floating-point numbers is often implemented in 38 * microcode, and incurs significant performance degradation. This hint 39 * instructs the thread pool to disable support for denormalized numbers before 40 * running the computation by manipulating architecture-specific control 41 * registers, and restore the initial value of control registers after the 42 * computation is complete. The thread pool temporarily disables denormalized 43 * numbers on all threads involved in the computation (i.e. the caller threads, 44 * and potentially worker threads). 45 * 46 * Disabling denormalized numbers may have a small negative effect on results' 47 * accuracy. As various architectures differ in capabilities to control 48 * processing of denormalized numbers, using this flag may also hurt results' 49 * reproducibility across different instruction set architectures. 50 */ 51 #define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001 52 53 /** 54 * Yield worker threads to the system scheduler after the operation is finished. 55 * 56 * Force workers to use kernel wait (instead of active spin-wait by default) for 57 * new commands after this command is processed. This flag affects only the 58 * immediate next operation on this thread pool. To make the thread pool always 59 * use kernel wait, pass this flag to all parallelization functions. 60 */ 61 #define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002 62 63 #ifdef __cplusplus 64 extern "C" { 65 #endif 66 67 /** 68 * Create a thread pool with the specified number of threads. 69 * 70 * @param threads_count the number of threads in the thread pool. 
71 * A value of 0 has special interpretation: it creates a thread pool with as 72 * many threads as there are logical processors in the system. 73 * 74 * @returns A pointer to an opaque thread pool object if the call is 75 * successful, or NULL pointer if the call failed. 76 */ 77 pthreadpool_t pthreadpool_create(size_t threads_count); 78 79 /** 80 * Query the number of threads in a thread pool. 81 * 82 * @param threadpool the thread pool to query. 83 * 84 * @returns The number of threads in the thread pool. 85 */ 86 size_t pthreadpool_get_threads_count(pthreadpool_t threadpool); 87 88 /** 89 * Process items on a 1D grid. 90 * 91 * The function implements a parallel version of the following snippet: 92 * 93 * for (size_t i = 0; i < range; i++) 94 * function(context, i); 95 * 96 * When the function returns, all items have been processed and the thread pool 97 * is ready for a new task. 98 * 99 * @note If multiple threads call this function with the same thread pool, the 100 * calls are serialized. 101 * 102 * @param threadpool the thread pool to use for parallelisation. If threadpool 103 * is NULL, all items are processed serially on the calling thread. 104 * @param function the function to call for each item. 105 * @param context the first argument passed to the specified function. 106 * @param range the number of items on the 1D grid to process. The 107 * specified function will be called once for each item. 108 * @param flags a bitwise combination of zero or more optional flags 109 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 110 */ 111 void pthreadpool_parallelize_1d( 112 pthreadpool_t threadpool, 113 pthreadpool_task_1d_t function, 114 void* context, 115 size_t range, 116 uint32_t flags); 117 118 /** 119 * Process items on a 1D grid using a microarchitecture-aware task function. 120 * 121 * The function implements a parallel version of the following snippet: 122 * 123 * uint32_t uarch_index = cpuinfo_initialize() ? 
124 * cpuinfo_get_current_uarch_index() : default_uarch_index; 125 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 126 * for (size_t i = 0; i < range; i++) 127 * function(context, uarch_index, i); 128 * 129 * When the function returns, all items have been processed and the thread pool 130 * is ready for a new task. 131 * 132 * @note If multiple threads call this function with the same thread pool, the 133 * calls are serialized. 134 * 135 * @param threadpool the thread pool to use for parallelisation. If 136 * threadpool is NULL, all items are processed serially on the calling 137 * thread. 138 * @param function the function to call for each item. 139 * @param context the first argument passed to the specified 140 * function. 141 * @param default_uarch_index the microarchitecture index to use when 142 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed, 143 * or index returned by cpuinfo_get_current_uarch_index() exceeds the 144 * max_uarch_index value. 145 * @param max_uarch_index the maximum microarchitecture index expected by 146 * the specified function. If the index returned by 147 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index 148 * will be used instead. default_uarch_index can exceed max_uarch_index. 149 * @param range the number of items on the 1D grid to process. 150 * The specified function will be called once for each item. 151 * @param flags a bitwise combination of zero or more optional 152 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or 153 * PTHREADPOOL_FLAG_YIELD_WORKERS) 154 */ 155 void pthreadpool_parallelize_1d_with_uarch( 156 pthreadpool_t threadpool, 157 pthreadpool_task_1d_with_id_t function, 158 void* context, 159 uint32_t default_uarch_index, 160 uint32_t max_uarch_index, 161 size_t range, 162 uint32_t flags); 163 164 /** 165 * Process items on a 1D grid with specified maximum tile size. 
166 * 167 * The function implements a parallel version of the following snippet: 168 * 169 * for (size_t i = 0; i < range; i += tile) 170 * function(context, i, min(range - i, tile)); 171 * 172 * When the call returns, all items have been processed and the thread pool is 173 * ready for a new task. 174 * 175 * @note If multiple threads call this function with the same thread pool, 176 * the calls are serialized. 177 * 178 * @param threadpool the thread pool to use for parallelisation. If threadpool 179 * is NULL, all items are processed serially on the calling thread. 180 * @param function the function to call for each tile. 181 * @param context the first argument passed to the specified function. 182 * @param range the number of items on the 1D grid to process. 183 * @param tile the maximum number of items on the 1D grid to process in 184 * one function call. 185 * @param flags a bitwise combination of zero or more optional flags 186 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 187 */ 188 void pthreadpool_parallelize_1d_tile_1d( 189 pthreadpool_t threadpool, 190 pthreadpool_task_1d_tile_1d_t function, 191 void* context, 192 size_t range, 193 size_t tile, 194 uint32_t flags); 195 196 /** 197 * Process items on a 2D grid. 198 * 199 * The function implements a parallel version of the following snippet: 200 * 201 * for (size_t i = 0; i < range_i; i++) 202 * for (size_t j = 0; j < range_j; j++) 203 * function(context, i, j); 204 * 205 * When the function returns, all items have been processed and the thread pool 206 * is ready for a new task. 207 * 208 * @note If multiple threads call this function with the same thread pool, the 209 * calls are serialized. 210 * 211 * @param threadpool the thread pool to use for parallelisation. If threadpool 212 * is NULL, all items are processed serially on the calling thread. 213 * @param function the function to call for each item. 214 * @param context the first argument passed to the specified function. 
215 * @param range_i the number of items to process along the first dimension 216 * of the 2D grid. 217 * @param range_j the number of items to process along the second dimension 218 * of the 2D grid. 219 * @param flags a bitwise combination of zero or more optional flags 220 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 221 */ 222 void pthreadpool_parallelize_2d( 223 pthreadpool_t threadpool, 224 pthreadpool_task_2d_t function, 225 void* context, 226 size_t range_i, 227 size_t range_j, 228 uint32_t flags); 229 230 /** 231 * Process items on a 2D grid with the specified maximum tile size along the 232 * last grid dimension. 233 * 234 * The function implements a parallel version of the following snippet: 235 * 236 * for (size_t i = 0; i < range_i; i++) 237 * for (size_t j = 0; j < range_j; j += tile_j) 238 * function(context, i, j, min(range_j - j, tile_j)); 239 * 240 * When the function returns, all items have been processed and the thread pool 241 * is ready for a new task. 242 * 243 * @note If multiple threads call this function with the same thread pool, the 244 * calls are serialized. 245 * 246 * @param threadpool the thread pool to use for parallelisation. If threadpool 247 * is NULL, all items are processed serially on the calling thread. 248 * @param function the function to call for each tile. 249 * @param context the first argument passed to the specified function. 250 * @param range_i the number of items to process along the first dimension 251 * of the 2D grid. 252 * @param range_j the number of items to process along the second dimension 253 * of the 2D grid. 254 * @param tile_j the maximum number of items along the second dimension of 255 * the 2D grid to process in one function call. 
256 * @param flags a bitwise combination of zero or more optional flags 257 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 258 */ 259 void pthreadpool_parallelize_2d_tile_1d( 260 pthreadpool_t threadpool, 261 pthreadpool_task_2d_tile_1d_t function, 262 void* context, 263 size_t range_i, 264 size_t range_j, 265 size_t tile_j, 266 uint32_t flags); 267 268 /** 269 * Process items on a 2D grid with the specified maximum tile size along each 270 * grid dimension. 271 * 272 * The function implements a parallel version of the following snippet: 273 * 274 * for (size_t i = 0; i < range_i; i += tile_i) 275 * for (size_t j = 0; j < range_j; j += tile_j) 276 * function(context, i, j, 277 * min(range_i - i, tile_i), min(range_j - j, tile_j)); 278 * 279 * When the function returns, all items have been processed and the thread pool 280 * is ready for a new task. 281 * 282 * @note If multiple threads call this function with the same thread pool, the 283 * calls are serialized. 284 * 285 * @param threadpool the thread pool to use for parallelisation. If threadpool 286 * is NULL, all items are processed serially on the calling thread. 287 * @param function the function to call for each tile. 288 * @param context the first argument passed to the specified function. 289 * @param range_i the number of items to process along the first dimension 290 * of the 2D grid. 291 * @param range_j the number of items to process along the second dimension 292 * of the 2D grid. 293 * @param tile_i the maximum number of items along the first dimension of 294 * the 2D grid to process in one function call. 295 * @param tile_j the maximum number of items along the second dimension of 296 * the 2D grid to process in one function call. 
297 * @param flags a bitwise combination of zero or more optional flags 298 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 299 */ 300 void pthreadpool_parallelize_2d_tile_2d( 301 pthreadpool_t threadpool, 302 pthreadpool_task_2d_tile_2d_t function, 303 void* context, 304 size_t range_i, 305 size_t range_j, 306 size_t tile_i, 307 size_t tile_j, 308 uint32_t flags); 309 310 /** 311 * Process items on a 2D grid with the specified maximum tile size along each 312 * grid dimension using a microarchitecture-aware task function. 313 * 314 * The function implements a parallel version of the following snippet: 315 * 316 * uint32_t uarch_index = cpuinfo_initialize() ? 317 * cpuinfo_get_current_uarch_index() : default_uarch_index; 318 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 319 * for (size_t i = 0; i < range_i; i += tile_i) 320 * for (size_t j = 0; j < range_j; j += tile_j) 321 * function(context, uarch_index, i, j, 322 * min(range_i - i, tile_i), min(range_j - j, tile_j)); 323 * 324 * When the function returns, all items have been processed and the thread pool 325 * is ready for a new task. 326 * 327 * @note If multiple threads call this function with the same thread pool, the 328 * calls are serialized. 329 * 330 * @param threadpool the thread pool to use for parallelisation. If 331 * threadpool is NULL, all items are processed serially on the calling 332 * thread. 333 * @param function the function to call for each tile. 334 * @param context the first argument passed to the specified 335 * function. 336 * @param default_uarch_index the microarchitecture index to use when 337 * pthreadpool is configured without cpuinfo, 338 * cpuinfo initialization failed, or index returned 339 * by cpuinfo_get_current_uarch_index() exceeds 340 * the max_uarch_index value. 341 * @param max_uarch_index the maximum microarchitecture index expected 342 * by the specified function. 
If the index returned 343 * by cpuinfo_get_current_uarch_index() exceeds this 344 * value, default_uarch_index will be used instead. 345 * default_uarch_index can exceed max_uarch_index. 346 * @param range_i the number of items to process along the first 347 * dimension of the 2D grid. 348 * @param range_j the number of items to process along the second 349 * dimension of the 2D grid. 350 * @param tile_i the maximum number of items along the first 351 * dimension of the 2D grid to process in one function call. 352 * @param tile_j the maximum number of items along the second 353 * dimension of the 2D grid to process in one function call. 354 * @param flags a bitwise combination of zero or more optional 355 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or 356 * PTHREADPOOL_FLAG_YIELD_WORKERS) 357 */ 358 void pthreadpool_parallelize_2d_tile_2d_with_uarch( 359 pthreadpool_t threadpool, 360 pthreadpool_task_2d_tile_2d_with_id_t function, 361 void* context, 362 uint32_t default_uarch_index, 363 uint32_t max_uarch_index, 364 size_t range_i, 365 size_t range_j, 366 size_t tile_i, 367 size_t tile_j, 368 uint32_t flags); 369 370 /** 371 * Process items on a 3D grid. 372 * 373 * The function implements a parallel version of the following snippet: 374 * 375 * for (size_t i = 0; i < range_i; i++) 376 * for (size_t j = 0; j < range_j; j++) 377 * for (size_t k = 0; k < range_k; k++) 378 * function(context, i, j, k); 379 * 380 * When the function returns, all items have been processed and the thread pool 381 * is ready for a new task. 382 * 383 * @note If multiple threads call this function with the same thread pool, the 384 * calls are serialized. 385 * 386 * @param threadpool the thread pool to use for parallelisation. If threadpool 387 * is NULL, all items are processed serially on the calling thread. 388 * @param function the function to call for each item. 389 * @param context the first argument passed to the specified function. 
390 * @param range_i the number of items to process along the first dimension 391 * of the 3D grid. 392 * @param range_j the number of items to process along the second dimension 393 * of the 3D grid. 394 * @param range_k the number of items to process along the third dimension 395 * of the 3D grid. 396 * @param flags a bitwise combination of zero or more optional flags 397 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 398 */ 399 void pthreadpool_parallelize_3d( 400 pthreadpool_t threadpool, 401 pthreadpool_task_3d_t function, 402 void* context, 403 size_t range_i, 404 size_t range_j, 405 size_t range_k, 406 uint32_t flags); 407 408 /** 409 * Process items on a 3D grid with the specified maximum tile size along the 410 * last grid dimension. 411 * 412 * The function implements a parallel version of the following snippet: 413 * 414 * for (size_t i = 0; i < range_i; i++) 415 * for (size_t j = 0; j < range_j; j++) 416 * for (size_t k = 0; k < range_k; k += tile_k) 417 * function(context, i, j, k, min(range_k - k, tile_k)); 418 * 419 * When the function returns, all items have been processed and the thread pool 420 * is ready for a new task. 421 * 422 * @note If multiple threads call this function with the same thread pool, the 423 * calls are serialized. 424 * 425 * @param threadpool the thread pool to use for parallelisation. If threadpool 426 * is NULL, all items are processed serially on the calling thread. 427 * @param function the function to call for each tile. 428 * @param context the first argument passed to the specified function. 429 * @param range_i the number of items to process along the first dimension 430 * of the 3D grid. 431 * @param range_j the number of items to process along the second dimension 432 * of the 3D grid. 433 * @param range_k the number of items to process along the third dimension 434 * of the 3D grid. 
435 * @param tile_k the maximum number of items along the third dimension of 436 * the 3D grid to process in one function call. 437 * @param flags a bitwise combination of zero or more optional flags 438 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 439 */ 440 void pthreadpool_parallelize_3d_tile_1d( 441 pthreadpool_t threadpool, 442 pthreadpool_task_3d_tile_1d_t function, 443 void* context, 444 size_t range_i, 445 size_t range_j, 446 size_t range_k, 447 size_t tile_k, 448 uint32_t flags); 449 450 /** 451 * Process items on a 3D grid with the specified maximum tile size along the 452 * last two grid dimensions. 453 * 454 * The function implements a parallel version of the following snippet: 455 * 456 * for (size_t i = 0; i < range_i; i++) 457 * for (size_t j = 0; j < range_j; j += tile_j) 458 * for (size_t k = 0; k < range_k; k += tile_k) 459 * function(context, i, j, k, 460 * min(range_j - j, tile_j), min(range_k - k, tile_k)); 461 * 462 * When the function returns, all items have been processed and the thread pool 463 * is ready for a new task. 464 * 465 * @note If multiple threads call this function with the same thread pool, the 466 * calls are serialized. 467 * 468 * @param threadpool the thread pool to use for parallelisation. If threadpool 469 * is NULL, all items are processed serially on the calling thread. 470 * @param function the function to call for each tile. 471 * @param context the first argument passed to the specified function. 472 * @param range_i the number of items to process along the first dimension 473 * of the 3D grid. 474 * @param range_j the number of items to process along the second dimension 475 * of the 3D grid. 476 * @param range_k the number of items to process along the third dimension 477 * of the 3D grid. 478 * @param tile_j the maximum number of items along the second dimension of 479 * the 3D grid to process in one function call. 
480 * @param tile_k the maximum number of items along the third dimension of 481 * the 3D grid to process in one function call. 482 * @param flags a bitwise combination of zero or more optional flags 483 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 484 */ 485 void pthreadpool_parallelize_3d_tile_2d( 486 pthreadpool_t threadpool, 487 pthreadpool_task_3d_tile_2d_t function, 488 void* context, 489 size_t range_i, 490 size_t range_j, 491 size_t range_k, 492 size_t tile_j, 493 size_t tile_k, 494 uint32_t flags); 495 496 /** 497 * Process items on a 3D grid with the specified maximum tile size along the 498 * last two grid dimensions using a microarchitecture-aware task function. 499 * 500 * The function implements a parallel version of the following snippet: 501 * 502 * uint32_t uarch_index = cpuinfo_initialize() ? 503 * cpuinfo_get_current_uarch_index() : default_uarch_index; 504 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 505 * for (size_t i = 0; i < range_i; i++) 506 * for (size_t j = 0; j < range_j; j += tile_j) 507 * for (size_t k = 0; k < range_k; k += tile_k) 508 * function(context, uarch_index, i, j, k, 509 * min(range_j - j, tile_j), min(range_k - k, tile_k)); 510 * 511 * When the function returns, all items have been processed and the thread pool 512 * is ready for a new task. 513 * 514 * @note If multiple threads call this function with the same thread pool, the 515 * calls are serialized. 516 * 517 * @param threadpool the thread pool to use for parallelisation. If 518 * threadpool is NULL, all items are processed serially on the calling 519 * thread. 520 * @param function the function to call for each tile. 521 * @param context the first argument passed to the specified 522 * function. 
523 * @param default_uarch_index the microarchitecture index to use when 524 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed, 525 * or index returned by cpuinfo_get_current_uarch_index() exceeds the 526 * max_uarch_index value. 527 * @param max_uarch_index the maximum microarchitecture index expected by 528 * the specified function. If the index returned by 529 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index 530 * will be used instead. default_uarch_index can exceed max_uarch_index. 531 * @param range_i the number of items to process along the first 532 * dimension of the 3D grid. 533 * @param range_j the number of items to process along the second 534 * dimension of the 3D grid. 535 * @param range_k the number of items to process along the third 536 * dimension of the 3D grid. 537 * @param tile_j the maximum number of items along the second 538 * dimension of the 3D grid to process in one function call. 539 * @param tile_k the maximum number of items along the third 540 * dimension of the 3D grid to process in one function call. 541 * @param flags a bitwise combination of zero or more optional 542 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or 543 * PTHREADPOOL_FLAG_YIELD_WORKERS) 544 */ 545 void pthreadpool_parallelize_3d_tile_2d_with_uarch( 546 pthreadpool_t threadpool, 547 pthreadpool_task_3d_tile_2d_with_id_t function, 548 void* context, 549 uint32_t default_uarch_index, 550 uint32_t max_uarch_index, 551 size_t range_i, 552 size_t range_j, 553 size_t range_k, 554 size_t tile_j, 555 size_t tile_k, 556 uint32_t flags); 557 558 /** 559 * Process items on a 4D grid. 
560 * 561 * The function implements a parallel version of the following snippet: 562 * 563 * for (size_t i = 0; i < range_i; i++) 564 * for (size_t j = 0; j < range_j; j++) 565 * for (size_t k = 0; k < range_k; k++) 566 * for (size_t l = 0; l < range_l; l++) 567 * function(context, i, j, k, l); 568 * 569 * When the function returns, all items have been processed and the thread pool 570 * is ready for a new task. 571 * 572 * @note If multiple threads call this function with the same thread pool, the 573 * calls are serialized. 574 * 575 * @param threadpool the thread pool to use for parallelisation. If threadpool 576 * is NULL, all items are processed serially on the calling thread. 577 * @param function the function to call for each item. 578 * @param context the first argument passed to the specified function. 579 * @param range_i the number of items to process along the first dimension 580 * of the 4D grid. 581 * @param range_j the number of items to process along the second dimension 582 * of the 4D grid. 583 * @param range_k the number of items to process along the third dimension 584 * of the 4D grid. 585 * @param range_l the number of items to process along the fourth dimension 586 * of the 4D grid. 587 * @param flags a bitwise combination of zero or more optional flags 588 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 589 */ 590 void pthreadpool_parallelize_4d( 591 pthreadpool_t threadpool, 592 pthreadpool_task_4d_t function, 593 void* context, 594 size_t range_i, 595 size_t range_j, 596 size_t range_k, 597 size_t range_l, 598 uint32_t flags); 599 600 /** 601 * Process items on a 4D grid with the specified maximum tile size along the 602 * last grid dimension. 
603 * 604 * The function implements a parallel version of the following snippet: 605 * 606 * for (size_t i = 0; i < range_i; i++) 607 * for (size_t j = 0; j < range_j; j++) 608 * for (size_t k = 0; k < range_k; k++) 609 * for (size_t l = 0; l < range_l; l += tile_l) 610 * function(context, i, j, k, l, min(range_l - l, tile_l)); 611 * 612 * When the function returns, all items have been processed and the thread pool 613 * is ready for a new task. 614 * 615 * @note If multiple threads call this function with the same thread pool, the 616 * calls are serialized. 617 * 618 * @param threadpool the thread pool to use for parallelisation. If threadpool 619 * is NULL, all items are processed serially on the calling thread. 620 * @param function the function to call for each tile. 621 * @param context the first argument passed to the specified function. 622 * @param range_i the number of items to process along the first dimension 623 * of the 4D grid. 624 * @param range_j the number of items to process along the second dimension 625 * of the 4D grid. 626 * @param range_k the number of items to process along the third dimension 627 * of the 4D grid. 628 * @param range_l the number of items to process along the fourth dimension 629 * of the 4D grid. 630 * @param tile_l the maximum number of items along the fourth dimension of 631 * the 4D grid to process in one function call. 632 * @param flags a bitwise combination of zero or more optional flags 633 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 634 */ 635 void pthreadpool_parallelize_4d_tile_1d( 636 pthreadpool_t threadpool, 637 pthreadpool_task_4d_tile_1d_t function, 638 void* context, 639 size_t range_i, 640 size_t range_j, 641 size_t range_k, 642 size_t range_l, 643 size_t tile_l, 644 uint32_t flags); 645 646 /** 647 * Process items on a 4D grid with the specified maximum tile size along the 648 * last two grid dimensions. 
649 * 650 * The function implements a parallel version of the following snippet: 651 * 652 * for (size_t i = 0; i < range_i; i++) 653 * for (size_t j = 0; j < range_j; j++) 654 * for (size_t k = 0; k < range_k; k += tile_k) 655 * for (size_t l = 0; l < range_l; l += tile_l) 656 * function(context, i, j, k, l, 657 * min(range_k - k, tile_k), min(range_l - l, tile_l)); 658 * 659 * When the function returns, all items have been processed and the thread pool 660 * is ready for a new task. 661 * 662 * @note If multiple threads call this function with the same thread pool, the 663 * calls are serialized. 664 * 665 * @param threadpool the thread pool to use for parallelisation. If threadpool 666 * is NULL, all items are processed serially on the calling thread. 667 * @param function the function to call for each tile. 668 * @param context the first argument passed to the specified function. 669 * @param range_i the number of items to process along the first dimension 670 * of the 4D grid. 671 * @param range_j the number of items to process along the second dimension 672 * of the 4D grid. 673 * @param range_k the number of items to process along the third dimension 674 * of the 4D grid. 675 * @param range_l the number of items to process along the fourth dimension 676 * of the 4D grid. 677 * @param tile_k the maximum number of items along the third dimension of 678 * the 4D grid to process in one function call. 679 * @param tile_l the maximum number of items along the fourth dimension of 680 * the 4D grid to process in one function call. 
681 * @param flags a bitwise combination of zero or more optional flags 682 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 683 */ 684 void pthreadpool_parallelize_4d_tile_2d( 685 pthreadpool_t threadpool, 686 pthreadpool_task_4d_tile_2d_t function, 687 void* context, 688 size_t range_i, 689 size_t range_j, 690 size_t range_k, 691 size_t range_l, 692 size_t tile_k, 693 size_t tile_l, 694 uint32_t flags); 695 696 /** 697 * Process items on a 4D grid with the specified maximum tile size along the 698 * last two grid dimensions using a microarchitecture-aware task function. 699 * 700 * The function implements a parallel version of the following snippet: 701 * 702 * uint32_t uarch_index = cpuinfo_initialize() ? 703 * cpuinfo_get_current_uarch_index() : default_uarch_index; 704 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 705 * for (size_t i = 0; i < range_i; i++) 706 * for (size_t j = 0; j < range_j; j++) 707 * for (size_t k = 0; k < range_k; k += tile_k) 708 * for (size_t l = 0; l < range_l; l += tile_l) 709 * function(context, uarch_index, i, j, k, l, 710 * min(range_k - k, tile_k), min(range_l - l, tile_l)); 711 * 712 * When the function returns, all items have been processed and the thread pool 713 * is ready for a new task. 714 * 715 * @note If multiple threads call this function with the same thread pool, the 716 * calls are serialized. 717 * 718 * @param threadpool the thread pool to use for parallelisation. If 719 * threadpool is NULL, all items are processed serially on the calling 720 * thread. 721 * @param function the function to call for each tile. 722 * @param context the first argument passed to the specified 723 * function. 724 * @param default_uarch_index the microarchitecture index to use when 725 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed, 726 * or index returned by cpuinfo_get_current_uarch_index() exceeds the 727 * max_uarch_index value. 
728 * @param max_uarch_index the maximum microarchitecture index expected by 729 * the specified function. If the index returned by 730 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index 731 * will be used instead. default_uarch_index can exceed max_uarch_index. 732 * @param range_i the number of items to process along the first 733 * dimension of the 4D grid. 734 * @param range_j the number of items to process along the second 735 * dimension of the 4D grid. 736 * @param range_k the number of items to process along the third 737 * dimension of the 4D grid. 738 * @param range_l the number of items to process along the fourth 739 * dimension of the 4D grid. 740 * @param tile_k the maximum number of items along the third 741 * dimension of the 4D grid to process in one function call. 742 * @param tile_l the maximum number of items along the fourth 743 * dimension of the 4D grid to process in one function call. 744 * @param flags a bitwise combination of zero or more optional 745 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or 746 * PTHREADPOOL_FLAG_YIELD_WORKERS) 747 */ 748 void pthreadpool_parallelize_4d_tile_2d_with_uarch( 749 pthreadpool_t threadpool, 750 pthreadpool_task_4d_tile_2d_with_id_t function, 751 void* context, 752 uint32_t default_uarch_index, 753 uint32_t max_uarch_index, 754 size_t range_i, 755 size_t range_j, 756 size_t range_k, 757 size_t range_l, 758 size_t tile_k, 759 size_t tile_l, 760 uint32_t flags); 761 762 /** 763 * Process items on a 5D grid. 764 * 765 * The function implements a parallel version of the following snippet: 766 * 767 * for (size_t i = 0; i < range_i; i++) 768 * for (size_t j = 0; j < range_j; j++) 769 * for (size_t k = 0; k < range_k; k++) 770 * for (size_t l = 0; l < range_l; l++) 771 * for (size_t m = 0; m < range_m; m++) 772 * function(context, i, j, k, l, m); 773 * 774 * When the function returns, all items have been processed and the thread pool 775 * is ready for a new task. 
776 * 777 * @note If multiple threads call this function with the same thread pool, the 778 * calls are serialized. 779 * 780 * @param threadpool the thread pool to use for parallelisation. If threadpool 781 * is NULL, all items are processed serially on the calling thread. 782 * @param function the function to call for each item. 783 * @param context the first argument passed to the specified function. 784 * @param range_i the number of items to process along the first dimension 785 * of the 5D grid. 786 * @param range_j the number of items to process along the second dimension 787 * of the 5D grid. 788 * @param range_k the number of items to process along the third dimension 789 * of the 5D grid. 790 * @param range_l the number of items to process along the fourth dimension 791 * of the 5D grid. 792 * @param range_m the number of items to process along the fifth dimension 793 * of the 5D grid. 794 * @param flags a bitwise combination of zero or more optional flags 795 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 796 */ 797 void pthreadpool_parallelize_5d( 798 pthreadpool_t threadpool, 799 pthreadpool_task_5d_t function, 800 void* context, 801 size_t range_i, 802 size_t range_j, 803 size_t range_k, 804 size_t range_l, 805 size_t range_m, 806 uint32_t flags); 807 808 /** 809 * Process items on a 5D grid with the specified maximum tile size along the 810 * last grid dimension. 811 * 812 * The function implements a parallel version of the following snippet: 813 * 814 * for (size_t i = 0; i < range_i; i++) 815 * for (size_t j = 0; j < range_j; j++) 816 * for (size_t k = 0; k < range_k; k++) 817 * for (size_t l = 0; l < range_l; l++) 818 * for (size_t m = 0; m < range_m; m += tile_m) 819 * function(context, i, j, k, l, m, min(range_m - m, tile_m)); 820 * 821 * When the function returns, all items have been processed and the thread pool 822 * is ready for a new task. 
823 * 824 * @note If multiple threads call this function with the same thread pool, the 825 * calls are serialized. 826 * 827 * @param threadpool the thread pool to use for parallelisation. If threadpool 828 * is NULL, all items are processed serially on the calling thread. 829 * @param function the function to call for each tile. 830 * @param context the first argument passed to the specified function. 831 * @param range_i the number of items to process along the first dimension 832 * of the 5D grid. 833 * @param range_j the number of items to process along the second dimension 834 * of the 5D grid. 835 * @param range_k the number of items to process along the third dimension 836 * of the 5D grid. 837 * @param range_l the number of items to process along the fourth dimension 838 * of the 5D grid. 839 * @param range_m the number of items to process along the fifth dimension 840 * of the 5D grid. 841 * @param tile_m the maximum number of items along the fifth dimension of 842 * the 5D grid to process in one function call. 843 * @param flags a bitwise combination of zero or more optional flags 844 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 845 */ 846 void pthreadpool_parallelize_5d_tile_1d( 847 pthreadpool_t threadpool, 848 pthreadpool_task_5d_tile_1d_t function, 849 void* context, 850 size_t range_i, 851 size_t range_j, 852 size_t range_k, 853 size_t range_l, 854 size_t range_m, 855 size_t tile_m, 856 uint32_t flags); 857 858 /** 859 * Process items on a 5D grid with the specified maximum tile size along the 860 * last two grid dimensions. 
861 * 862 * The function implements a parallel version of the following snippet: 863 * 864 * for (size_t i = 0; i < range_i; i++) 865 * for (size_t j = 0; j < range_j; j++) 866 * for (size_t k = 0; k < range_k; k++) 867 * for (size_t l = 0; l < range_l; l += tile_l) 868 * for (size_t m = 0; m < range_m; m += tile_m) 869 * function(context, i, j, k, l, m, 870 * min(range_l - l, tile_l), min(range_m - m, tile_m)); 871 * 872 * When the function returns, all items have been processed and the thread pool 873 * is ready for a new task. 874 * 875 * @note If multiple threads call this function with the same thread pool, the 876 * calls are serialized. 877 * 878 * @param threadpool the thread pool to use for parallelisation. If threadpool 879 * is NULL, all items are processed serially on the calling thread. 880 * @param function the function to call for each tile. 881 * @param context the first argument passed to the specified function. 882 * @param range_i the number of items to process along the first dimension 883 * of the 5D grid. 884 * @param range_j the number of items to process along the second dimension 885 * of the 5D grid. 886 * @param range_k the number of items to process along the third dimension 887 * of the 5D grid. 888 * @param range_l the number of items to process along the fourth dimension 889 * of the 5D grid. 890 * @param range_m the number of items to process along the fifth dimension 891 * of the 5D grid. 892 * @param tile_l the maximum number of items along the fourth dimension of 893 * the 5D grid to process in one function call. 894 * @param tile_m the maximum number of items along the fifth dimension of 895 * the 5D grid to process in one function call. 
896 * @param flags a bitwise combination of zero or more optional flags 897 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 898 */ 899 void pthreadpool_parallelize_5d_tile_2d( 900 pthreadpool_t threadpool, 901 pthreadpool_task_5d_tile_2d_t function, 902 void* context, 903 size_t range_i, 904 size_t range_j, 905 size_t range_k, 906 size_t range_l, 907 size_t range_m, 908 size_t tile_l, 909 size_t tile_m, 910 uint32_t flags); 911 912 /** 913 * Process items on a 6D grid. 914 * 915 * The function implements a parallel version of the following snippet: 916 * 917 * for (size_t i = 0; i < range_i; i++) 918 * for (size_t j = 0; j < range_j; j++) 919 * for (size_t k = 0; k < range_k; k++) 920 * for (size_t l = 0; l < range_l; l++) 921 * for (size_t m = 0; m < range_m; m++) 922 * for (size_t n = 0; n < range_n; n++) 923 * function(context, i, j, k, l, m, n); 924 * 925 * When the function returns, all items have been processed and the thread pool 926 * is ready for a new task. 927 * 928 * @note If multiple threads call this function with the same thread pool, the 929 * calls are serialized. 930 * 931 * @param threadpool the thread pool to use for parallelisation. If threadpool 932 * is NULL, all items are processed serially on the calling thread. 933 * @param function the function to call for each item. 934 * @param context the first argument passed to the specified function. 935 * @param range_i the number of items to process along the first dimension 936 * of the 6D grid. 937 * @param range_j the number of items to process along the second dimension 938 * of the 6D grid. 939 * @param range_k the number of items to process along the third dimension 940 * of the 6D grid. 941 * @param range_l the number of items to process along the fourth dimension 942 * of the 6D grid. 943 * @param range_m the number of items to process along the fifth dimension 944 * of the 6D grid. 
945 * @param range_n the number of items to process along the sixth dimension 946 * of the 6D grid. 949 * @param flags a bitwise combination of zero or more optional flags 950 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 951 */ 952 void pthreadpool_parallelize_6d( 953 pthreadpool_t threadpool, 954 pthreadpool_task_6d_t function, 955 void* context, 956 size_t range_i, 957 size_t range_j, 958 size_t range_k, 959 size_t range_l, 960 size_t range_m, 961 size_t range_n, 962 uint32_t flags); 963 964 /** 965 * Process items on a 6D grid with the specified maximum tile size along the 966 * last grid dimension. 967 * 968 * The function implements a parallel version of the following snippet: 969 * 970 * for (size_t i = 0; i < range_i; i++) 971 * for (size_t j = 0; j < range_j; j++) 972 * for (size_t k = 0; k < range_k; k++) 973 * for (size_t l = 0; l < range_l; l++) 974 * for (size_t m = 0; m < range_m; m++) 975 * for (size_t n = 0; n < range_n; n += tile_n) 976 * function(context, i, j, k, l, m, n, min(range_n - n, tile_n)); 977 * 978 * When the function returns, all items have been processed and the thread pool 979 * is ready for a new task. 980 * 981 * @note If multiple threads call this function with the same thread pool, the 982 * calls are serialized. 983 * 984 * @param threadpool the thread pool to use for parallelisation. If threadpool 985 * is NULL, all items are processed serially on the calling thread. 986 * @param function the function to call for each tile. 987 * @param context the first argument passed to the specified function. 988 * @param range_i the number of items to process along the first dimension 989 * of the 6D grid. 990 * @param range_j the number of items to process along the second dimension 991 * of the 6D grid. 
992 * @param range_k the number of items to process along the third dimension 993 * of the 6D grid. 994 * @param range_l the number of items to process along the fourth dimension 995 * of the 6D grid. 996 * @param range_m the number of items to process along the fifth dimension 997 * of the 6D grid. 998 * @param range_n the number of items to process along the sixth dimension 999 * of the 6D grid. 1000 * @param tile_n the maximum number of items along the sixth dimension of 1001 * the 6D grid to process in one function call. 1002 * @param flags a bitwise combination of zero or more optional flags 1003 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1004 */ 1005 void pthreadpool_parallelize_6d_tile_1d( 1006 pthreadpool_t threadpool, 1007 pthreadpool_task_6d_tile_1d_t function, 1008 void* context, 1009 size_t range_i, 1010 size_t range_j, 1011 size_t range_k, 1012 size_t range_l, 1013 size_t range_m, 1014 size_t range_n, 1015 size_t tile_n, 1016 uint32_t flags); 1017 1018 /** 1019 * Process items on a 6D grid with the specified maximum tile size along the 1020 * last two grid dimensions. 1021 * 1022 * The function implements a parallel version of the following snippet: 1023 * 1024 * for (size_t i = 0; i < range_i; i++) 1025 * for (size_t j = 0; j < range_j; j++) 1026 * for (size_t k = 0; k < range_k; k++) 1027 * for (size_t l = 0; l < range_l; l++) 1028 * for (size_t m = 0; m < range_m; m += tile_m) 1029 * for (size_t n = 0; n < range_n; n += tile_n) 1030 * function(context, i, j, k, l, m, n, 1031 * min(range_m - m, tile_m), min(range_n - n, tile_n)); 1032 * 1033 * When the function returns, all items have been processed and the thread pool 1034 * is ready for a new task. 1035 * 1036 * @note If multiple threads call this function with the same thread pool, the 1037 * calls are serialized. 1038 * 1039 * @param threadpool the thread pool to use for parallelisation. 
If threadpool 1040 * is NULL, all items are processed serially on the calling thread. 1041 * @param function the function to call for each tile. 1042 * @param context the first argument passed to the specified function. 1043 * @param range_i the number of items to process along the first dimension 1044 * of the 6D grid. 1045 * @param range_j the number of items to process along the second dimension 1046 * of the 6D grid. 1047 * @param range_k the number of items to process along the third dimension 1048 * of the 6D grid. 1049 * @param range_l the number of items to process along the fourth dimension 1050 * of the 6D grid. 1051 * @param range_m the number of items to process along the fifth dimension 1052 * of the 6D grid. 1053 * @param range_n the number of items to process along the sixth dimension 1054 * of the 6D grid. 1055 * @param tile_m the maximum number of items along the fifth dimension of 1056 * the 6D grid to process in one function call. 1057 * @param tile_n the maximum number of items along the sixth dimension of 1058 * the 6D grid to process in one function call. 1059 * @param flags a bitwise combination of zero or more optional flags 1060 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1061 */ 1062 void pthreadpool_parallelize_6d_tile_2d( 1063 pthreadpool_t threadpool, 1064 pthreadpool_task_6d_tile_2d_t function, 1065 void* context, 1066 size_t range_i, 1067 size_t range_j, 1068 size_t range_k, 1069 size_t range_l, 1070 size_t range_m, 1071 size_t range_n, 1072 size_t tile_m, 1073 size_t tile_n, 1074 uint32_t flags); 1075 1076 /** 1077 * Terminates threads in the thread pool and releases associated resources. 1078 * 1079 * @warning Accessing the thread pool after a call to this function constitutes 1080 * undefined behaviour and may cause data corruption. 1081 * 1082 * @param[in,out] threadpool The thread pool to destroy. 
1083 */ 1084 void pthreadpool_destroy(pthreadpool_t threadpool); 1085 1086 1087 #ifndef PTHREADPOOL_NO_DEPRECATED_API 1088 1089 /* Legacy API for compatibility with pre-existing users (e.g. NNPACK). Every typedef and function in this section is tagged PTHREADPOOL_DEPRECATED, which expands to the deprecated attribute when __GNUC__ is defined and to nothing otherwise. Defining PTHREADPOOL_NO_DEPRECATED_API hides the whole section. */ 1090 #if defined(__GNUC__) 1091 #define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__)) 1092 #else 1093 #define PTHREADPOOL_DEPRECATED 1094 #endif 1095 1096 typedef void (*pthreadpool_function_1d_t)(void*, size_t) PTHREADPOOL_DEPRECATED; 1097 typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t) PTHREADPOOL_DEPRECATED; 1098 typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t) PTHREADPOOL_DEPRECATED; 1099 typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED; 1100 typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED; 1101 typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED; 1102 1103 void pthreadpool_compute_1d( 1104 pthreadpool_t threadpool, 1105 pthreadpool_function_1d_t function, 1106 void* argument, 1107 size_t range) PTHREADPOOL_DEPRECATED; 1108 1109 void pthreadpool_compute_1d_tiled( 1110 pthreadpool_t threadpool, 1111 pthreadpool_function_1d_tiled_t function, 1112 void* argument, 1113 size_t range, 1114 size_t tile) PTHREADPOOL_DEPRECATED; 1115 1116 void pthreadpool_compute_2d( 1117 pthreadpool_t threadpool, 1118 pthreadpool_function_2d_t function, 1119 void* argument, 1120 size_t range_i, 1121 size_t range_j) PTHREADPOOL_DEPRECATED; 1122 1123 void pthreadpool_compute_2d_tiled( 1124 pthreadpool_t threadpool, 1125 pthreadpool_function_2d_tiled_t function, 1126 void* argument, 1127 size_t range_i, 1128 size_t range_j, 1129 size_t tile_i, 1130 size_t tile_j) PTHREADPOOL_DEPRECATED; 1131 1132 void pthreadpool_compute_3d_tiled( 1133 pthreadpool_t threadpool, 1134 pthreadpool_function_3d_tiled_t function, 1135 void* 
argument, 1136 size_t range_i, 1137 size_t range_j, 1138 size_t range_k, 1139 size_t tile_i, 1140 size_t tile_j, 1141 size_t tile_k) PTHREADPOOL_DEPRECATED; 1142 1143 void pthreadpool_compute_4d_tiled( 1144 pthreadpool_t threadpool, 1145 pthreadpool_function_4d_tiled_t function, 1146 void* argument, 1147 size_t range_i, 1148 size_t range_j, 1149 size_t range_k, 1150 size_t range_l, 1151 size_t tile_i, 1152 size_t tile_j, 1153 size_t tile_k, 1154 size_t tile_l) PTHREADPOOL_DEPRECATED; 1155 1156 #endif /* PTHREADPOOL_NO_DEPRECATED_API */ 1157 1158 #ifdef __cplusplus 1159 } /* extern "C" */ 1160 #endif 1161 1162 #endif /* PTHREADPOOL_H_ */ 1163