1 /* Standard C headers */
2 #include <stddef.h>
3
4 /* Dependencies */
5 #include <fxdiv.h>
6
7 /* Library header */
8 #include <pthreadpool.h>
9
10
/*
 * Returns dividend / divisor rounded up to the nearest integer.
 *
 * divisor must be non-zero: both the division and the modulo below are
 * undefined for a zero divisor.
 */
static inline size_t divide_round_up(size_t dividend, size_t divisor) {
	const size_t quotient = dividend / divisor;
	const size_t remainder = dividend % divisor;
	/* A non-zero remainder means one more (partial) unit is needed. */
	return remainder != 0 ? quotient + 1 : quotient;
}
18
/* Returns the smaller of the two values (either one when they are equal). */
static inline size_t min(size_t a, size_t b) {
	if (a < b) {
		return a;
	}
	return b;
}
22
/*
 * Forwards a 1D computation to pthreadpool_parallelize_1d.
 *
 * The callback is cast to the task type taken by that function and the
 * call is made with no flags set; threadpool, argument and range are
 * passed through unchanged.
 */
void pthreadpool_compute_1d(
	pthreadpool_t threadpool,
	pthreadpool_function_1d_t function,
	void* argument,
	size_t range)
{
	const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) function;
	pthreadpool_parallelize_1d(threadpool, task, argument, range, 0 /* flags */);
}
33
/*
 * Forwards a tiled 1D computation to pthreadpool_parallelize_1d_tile_1d.
 *
 * The callback is cast to the task type taken by that function and the
 * call is made with no flags set; threadpool, argument, range and tile are
 * passed through unchanged.
 */
void pthreadpool_compute_1d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_1d_tiled_t function,
	void* argument,
	size_t range,
	size_t tile)
{
	const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) function;
	pthreadpool_parallelize_1d_tile_1d(threadpool, task, argument, range, tile, 0 /* flags */);
}
45
/*
 * Forwards a 2D computation to pthreadpool_parallelize_2d.
 *
 * The callback is cast to the task type taken by that function and the
 * call is made with no flags set; threadpool, argument and both ranges are
 * passed through unchanged.
 */
void pthreadpool_compute_2d(
	pthreadpool_t threadpool,
	pthreadpool_function_2d_t function,
	void* argument,
	size_t range_i,
	size_t range_j)
{
	const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) function;
	pthreadpool_parallelize_2d(threadpool, task, argument, range_i, range_j, 0 /* flags */);
}
57
/*
 * Forwards a tiled 2D computation to pthreadpool_parallelize_2d_tile_2d.
 *
 * The callback is cast to the task type taken by that function and the
 * call is made with no flags set; threadpool, argument, both ranges and
 * both tile sizes are passed through unchanged.
 */
void pthreadpool_compute_2d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_2d_tiled_t function,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t tile_i,
	size_t tile_j)
{
	const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) function;
	pthreadpool_parallelize_2d_tile_2d(
		threadpool, task, argument,
		range_i, range_j,
		tile_i, tile_j,
		0 /* flags */);
}
71
72 struct compute_3d_tiled_context {
73 pthreadpool_function_3d_tiled_t function;
74 void* argument;
75 struct fxdiv_divisor_size_t tile_range_j;
76 struct fxdiv_divisor_size_t tile_range_k;
77 size_t range_i;
78 size_t range_j;
79 size_t range_k;
80 size_t tile_i;
81 size_t tile_j;
82 size_t tile_k;
83 };
84
compute_3d_tiled(const struct compute_3d_tiled_context * context,size_t linear_index)85 static void compute_3d_tiled(const struct compute_3d_tiled_context* context, size_t linear_index) {
86 const struct fxdiv_divisor_size_t tile_range_k = context->tile_range_k;
87 const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
88 const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j;
89 const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
90 const size_t max_tile_i = context->tile_i;
91 const size_t max_tile_j = context->tile_j;
92 const size_t max_tile_k = context->tile_k;
93 const size_t index_i = tile_index_i_j.quotient * max_tile_i;
94 const size_t index_j = tile_index_i_j.remainder * max_tile_j;
95 const size_t index_k = tile_index_ij_k.remainder * max_tile_k;
96 const size_t tile_i = min(max_tile_i, context->range_i - index_i);
97 const size_t tile_j = min(max_tile_j, context->range_j - index_j);
98 const size_t tile_k = min(max_tile_k, context->range_k - index_k);
99 context->function(context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k);
100 }
101
/*
 * Invokes function once per tile of a 3D iteration space of size
 * range_i x range_j x range_k, split into tiles of at most
 * tile_i x tile_j x tile_k elements.
 *
 * Each call receives argument, the offsets of the tile's first element and
 * the tile's actual extents (clamped at the upper edge of each range).
 *
 * When the pool reports at most one thread the tiles are visited on the
 * calling thread in i-major, k-minor order. Otherwise the tiles are
 * linearized and dispatched through pthreadpool_parallelize_1d.
 *
 * Every tile size must be non-zero: a zero tile would keep the sequential
 * loops from advancing and would divide by zero on the parallel path.
 */
void pthreadpool_compute_3d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_3d_tiled_t function,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_i,
	size_t tile_j,
	size_t tile_k)
{
	if (pthreadpool_get_threads_count(threadpool) > 1) {
		/* Parallel path: one 1D work item per tile. */
		const size_t tiles_i = divide_round_up(range_i, tile_i);
		const size_t tiles_j = divide_round_up(range_j, tile_j);
		const size_t tiles_k = divide_round_up(range_k, tile_k);
		const size_t tile_count = tiles_i * tiles_j * tiles_k;

		struct compute_3d_tiled_context context = {
			.function = function,
			.argument = argument,
			.tile_range_j = fxdiv_init_size_t(tiles_j),
			.tile_range_k = fxdiv_init_size_t(tiles_k),
			.range_i = range_i,
			.range_j = range_j,
			.range_k = range_k,
			.tile_i = tile_i,
			.tile_j = tile_j,
			.tile_k = tile_k
		};
		pthreadpool_parallelize_1d(threadpool,
			(pthreadpool_task_1d_t) compute_3d_tiled, &context,
			tile_count,
			0 /* flags */);
		return;
	}

	/* Sequential path: walk the tiles directly on the calling thread. */
	for (size_t i = 0; i < range_i; i += tile_i) {
		const size_t extent_i = min(range_i - i, tile_i);
		for (size_t j = 0; j < range_j; j += tile_j) {
			const size_t extent_j = min(range_j - j, tile_j);
			for (size_t k = 0; k < range_k; k += tile_k) {
				const size_t extent_k = min(range_k - k, tile_k);
				function(argument, i, j, k, extent_i, extent_j, extent_k);
			}
		}
	}
}
145
146 struct compute_4d_tiled_context {
147 pthreadpool_function_4d_tiled_t function;
148 void* argument;
149 struct fxdiv_divisor_size_t tile_range_kl;
150 struct fxdiv_divisor_size_t tile_range_j;
151 struct fxdiv_divisor_size_t tile_range_l;
152 size_t range_i;
153 size_t range_j;
154 size_t range_k;
155 size_t range_l;
156 size_t tile_i;
157 size_t tile_j;
158 size_t tile_k;
159 size_t tile_l;
160 };
161
compute_4d_tiled(const struct compute_4d_tiled_context * context,size_t linear_index)162 static void compute_4d_tiled(const struct compute_4d_tiled_context* context, size_t linear_index) {
163 const struct fxdiv_divisor_size_t tile_range_kl = context->tile_range_kl;
164 const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
165 const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j;
166 const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, tile_range_j);
167 const struct fxdiv_divisor_size_t tile_range_l = context->tile_range_l;
168 const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
169 const size_t max_tile_i = context->tile_i;
170 const size_t max_tile_j = context->tile_j;
171 const size_t max_tile_k = context->tile_k;
172 const size_t max_tile_l = context->tile_l;
173 const size_t index_i = tile_index_i_j.quotient * max_tile_i;
174 const size_t index_j = tile_index_i_j.remainder * max_tile_j;
175 const size_t index_k = tile_index_k_l.quotient * max_tile_k;
176 const size_t index_l = tile_index_k_l.remainder * max_tile_l;
177 const size_t tile_i = min(max_tile_i, context->range_i - index_i);
178 const size_t tile_j = min(max_tile_j, context->range_j - index_j);
179 const size_t tile_k = min(max_tile_k, context->range_k - index_k);
180 const size_t tile_l = min(max_tile_l, context->range_l - index_l);
181 context->function(context->argument, index_i, index_j, index_k, index_l, tile_i, tile_j, tile_k, tile_l);
182 }
183
/*
 * Invokes function once per tile of a 4D iteration space of size
 * range_i x range_j x range_k x range_l, split into tiles of at most
 * tile_i x tile_j x tile_k x tile_l elements.
 *
 * Each call receives argument, the offsets of the tile's first element and
 * the tile's actual extents (clamped at the upper edge of each range).
 *
 * When the pool reports at most one thread the tiles are visited on the
 * calling thread in i-major, l-minor order. Otherwise the tiles are
 * linearized and dispatched through pthreadpool_parallelize_1d.
 *
 * Every tile size must be non-zero: a zero tile would keep the sequential
 * loops from advancing and would divide by zero on the parallel path.
 */
void pthreadpool_compute_4d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_4d_tiled_t function,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_i,
	size_t tile_j,
	size_t tile_k,
	size_t tile_l)
{
	if (pthreadpool_get_threads_count(threadpool) > 1) {
		/* Parallel path: one 1D work item per tile. */
		const size_t tiles_i = divide_round_up(range_i, tile_i);
		const size_t tiles_j = divide_round_up(range_j, tile_j);
		const size_t tiles_k = divide_round_up(range_k, tile_k);
		const size_t tiles_l = divide_round_up(range_l, tile_l);
		const size_t tiles_kl = tiles_k * tiles_l;
		const size_t tile_count = tiles_i * tiles_j * tiles_kl;

		struct compute_4d_tiled_context context = {
			.function = function,
			.argument = argument,
			.tile_range_kl = fxdiv_init_size_t(tiles_kl),
			.tile_range_j = fxdiv_init_size_t(tiles_j),
			.tile_range_l = fxdiv_init_size_t(tiles_l),
			.range_i = range_i,
			.range_j = range_j,
			.range_k = range_k,
			.range_l = range_l,
			.tile_i = tile_i,
			.tile_j = tile_j,
			.tile_k = tile_k,
			.tile_l = tile_l
		};
		pthreadpool_parallelize_1d(threadpool,
			(pthreadpool_task_1d_t) compute_4d_tiled, &context,
			tile_count,
			0 /* flags */);
		return;
	}

	/* Sequential path: walk the tiles directly on the calling thread. */
	for (size_t i = 0; i < range_i; i += tile_i) {
		const size_t extent_i = min(range_i - i, tile_i);
		for (size_t j = 0; j < range_j; j += tile_j) {
			const size_t extent_j = min(range_j - j, tile_j);
			for (size_t k = 0; k < range_k; k += tile_k) {
				const size_t extent_k = min(range_k - k, tile_k);
				for (size_t l = 0; l < range_l; l += tile_l) {
					const size_t extent_l = min(range_l - l, tile_l);
					function(argument, i, j, k, l,
						extent_i, extent_j, extent_k, extent_l);
				}
			}
		}
	}
}
236