1 #pragma once
2 #ifndef FXDIV_H
3 #define FXDIV_H
4
5 #if defined(__cplusplus) && (__cplusplus >= 201103L)
6 #include <cstddef>
7 #include <cstdint>
8 #include <climits>
9 #elif !defined(__OPENCL_VERSION__)
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <limits.h>
13 #endif
14
15 #if defined(_MSC_VER)
16 #include <intrin.h>
17 #endif
18
/*
 * FXDIV_USE_INLINE_ASSEMBLY selects the hand-written x86/x86-64 BSR and DIV
 * inline-assembly paths below. Define it to 0 before including this header
 * to force the portable / compiler-intrinsic implementations instead.
 */
#ifndef FXDIV_USE_INLINE_ASSEMBLY
#define FXDIV_USE_INLINE_ASSEMBLY 1
#endif
22
/*
 * Widening unsigned multiplication: returns the full 64-bit product of two
 * 32-bit operands. 32-bit MSVC needs the __emulu intrinsic to get a single
 * MUL instruction; everywhere else the cast-then-multiply form compiles to
 * the same thing.
 */
static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) {
#if defined(_MSC_VER) && defined(_M_IX86)
  return (uint64_t) __emulu((unsigned int) a, (unsigned int) b);
#else
  const uint64_t wide_a = (uint64_t) a;
  const uint64_t wide_b = (uint64_t) b;
  return wide_a * wide_b;
#endif
}
30
/*
 * Returns the high 32 bits of the 64-bit product a * b.
 * Dispatches to the best primitive per platform: OpenCL mul_hi, CUDA
 * __umulhi, 32-bit MSVC __emulu, MSVC ARM _MulUnsignedHigh, and a plain
 * 64-bit multiply-and-shift otherwise.
 */
static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) {
#if defined(__OPENCL_VERSION__)
  return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
  return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b);
#elif defined(_MSC_VER) && defined(_M_IX86)
  return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32);
#elif defined(_MSC_VER) && defined(_M_ARM)
  return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b);
#else
  return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32);
#endif
}
44
/*
 * Returns the high 64 bits of the 128-bit product a * b.
 * Prefers a native wide-multiply primitive (OpenCL mul_hi, CUDA __umul64hi,
 * MSVC x64 __umulh, GCC-style unsigned __int128); otherwise falls back to
 * schoolbook multiplication on 32-bit limbs.
 */
static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) {
#if defined(__OPENCL_VERSION__)
  return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
  return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b);
#elif defined(_MSC_VER) && defined(_M_X64)
  return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b);
#elif defined(__GNUC__) && defined(__SIZEOF_INT128__)
  return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64);
#else
  /* Split both operands into 32-bit halves. */
  const uint32_t a_lo = (uint32_t) a;
  const uint32_t a_hi = (uint32_t) (a >> 32);
  const uint32_t b_lo = (uint32_t) b;
  const uint32_t b_hi = (uint32_t) (b >> 32);

  /* t = a_hi*b_lo plus the carry out of the low product a_lo*b_lo;
   * it cannot overflow 64 bits. */
  const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) +
    (uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo);
  /* Sum the partial products contributing to bits 64..127: a_hi*b_hi,
   * the high word of t, and the high word of (a_lo*b_hi + low word of t). */
  return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) +
    ((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32);
#endif
}
66
/*
 * Returns the high word of the double-width product of two size_t values,
 * delegating to the 32- or 64-bit implementation that matches SIZE_MAX.
 */
static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) {
#if SIZE_MAX == UINT32_MAX
  const uint32_t hi = fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b);
  return (size_t) hi;
#elif SIZE_MAX == UINT64_MAX
  const uint64_t hi = fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b);
  return (size_t) hi;
#else
  #error Unsupported platform
#endif
}
76
/*
 * Precomputed constants for fast division by a fixed uint32_t divisor.
 * Produced by fxdiv_init_uint32_t; consumed by fxdiv_quotient_uint32_t,
 * which evaluates q = (t + ((n - t) >> s1)) >> s2 with t = mulhi(n, m).
 */
struct fxdiv_divisor_uint32_t {
  uint32_t value; /* the original divisor d */
  uint32_t m;     /* magic multiplier */
  uint8_t s1;     /* first shift (0 when d == 1, otherwise 1) */
  uint8_t s2;     /* second shift */
};
83
/* Quotient and remainder of a uint32_t division, as returned by fxdiv_divide_uint32_t. */
struct fxdiv_result_uint32_t {
  uint32_t quotient;
  uint32_t remainder;
};
88
/*
 * Precomputed constants for fast division by a fixed uint64_t divisor.
 * Produced by fxdiv_init_uint64_t; consumed by fxdiv_quotient_uint64_t.
 */
struct fxdiv_divisor_uint64_t {
  uint64_t value; /* the original divisor d */
  uint64_t m;     /* magic multiplier */
  uint8_t s1;     /* first shift (0 when d == 1, otherwise 1) */
  uint8_t s2;     /* second shift */
};
95
/* Quotient and remainder of a uint64_t division, as returned by fxdiv_divide_uint64_t. */
struct fxdiv_result_uint64_t {
  uint64_t quotient;
  uint64_t remainder;
};
100
/*
 * Precomputed constants for fast division by a fixed size_t divisor.
 * Mirrors the uint32/uint64 divisor structs; which one it maps onto is
 * decided by SIZE_MAX at compile time.
 */
struct fxdiv_divisor_size_t {
  size_t value; /* the original divisor d */
  size_t m;     /* magic multiplier */
  uint8_t s1;   /* first shift (0 when d == 1, otherwise 1) */
  uint8_t s2;   /* second shift */
};
107
/* Quotient and remainder of a size_t division, as returned by fxdiv_divide_size_t. */
struct fxdiv_result_size_t {
  size_t quotient;
  size_t remainder;
};
112
/*
 * Precomputes magic constants for fast division by the uint32_t divisor d.
 * This is the multiply-high-and-shift scheme from Hacker's Delight ch. 10 /
 * Granlund-Montgomery: fxdiv_quotient_uint32_t later computes
 * q = (t + ((n - t) >> s1)) >> s2 with t = mulhi(n, m).
 * Precondition: d != 0 (d == 0 produces meaningless constants — the 2^32/d
 * step below would divide by zero; callers must not pass 0).
 */
static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
  struct fxdiv_divisor_uint32_t result = { d };
  if (d == 1) {
    /* Division by 1 is the identity: multiplier 1, no shifts. */
    result.m = UINT32_C(1);
    result.s1 = 0;
    result.s2 = 0;
  } else {
    /* l_minus_1 = floor(log2(d - 1)) = ceil(log2(d)) - 1, computed with the
     * best bit-scan primitive available on the target. */
#if defined(__OPENCL_VERSION__)
    const uint32_t l_minus_1 = 31 - clz(d - 1);
#elif defined(__CUDA_ARCH__)
    const uint32_t l_minus_1 = 31 - __clz((int) (d - 1));
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
    unsigned long l_minus_1;
    _BitScanReverse(&l_minus_1, (unsigned long) (d - 1));
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY
    uint32_t l_minus_1;
    __asm__("BSRL %[d_minus_1], %[l_minus_1]"
      : [l_minus_1] "=r" (l_minus_1)
      : [d_minus_1] "r" (d - 1));
#elif defined(__GNUC__)
    const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1);
#else
    /* Based on Algorithm 2 from Hacker's delight */

    uint32_t l_minus_1 = 0;
    uint32_t x = d - 1;
    uint32_t y = x >> 16;
    if (y != 0) {
      l_minus_1 += 16;
      x = y;
    }
    y = x >> 8;
    if (y != 0) {
      l_minus_1 += 8;
      x = y;
    }
    y = x >> 4;
    if (y != 0) {
      l_minus_1 += 4;
      x = y;
    }
    y = x >> 2;
    if (y != 0) {
      l_minus_1 += 2;
      x = y;
    }
    if ((x & 2) != 0) {
      l_minus_1 += 1;
    }
#endif
    /* u_hi = 2^l - d where l = ceil(log2(d)); high word of the dividend
     * for the 2^(32+l) / d division below. */
    uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d;

    /* Division of 64-bit number u_hi:UINT32_C(0) by 32-bit number d, 32-bit quotient output q */
#if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY
    uint32_t q;
    /* DIVL divides EDX:EAX by d; EDX holds u_hi, EAX holds the zero low word. */
    __asm__("DIVL %[d]"
      : "=a" (q), "+d" (u_hi)
      : [d] "r" (d), "a" (0));
#else
    const uint32_t q = ((uint64_t) u_hi << 32) / d;
#endif

    /* Round the reciprocal up; the s1 = 1 correction step in the quotient
     * routine compensates for the rounding. */
    result.m = q + UINT32_C(1);
    result.s1 = 1;
    result.s2 = (uint8_t) l_minus_1;
  }
  return result;
}
181
/*
 * Precomputes magic constants for fast division by the uint64_t divisor d.
 * 64-bit analogue of fxdiv_init_uint32_t: computes m = floor(2^(64+l)/d) + 1
 * with l = ceil(log2(d)), using either a hardware 128/64 divide (x86-64
 * inline assembly) or a software long division on 32-bit digits.
 * Precondition: d != 0 (callers must not pass 0).
 *
 * nlz_d (number of leading zero bits in d) is only needed to normalize the
 * divisor for the software division path; branches that use the DIVQ
 * assembly path deliberately do not compute it.
 */
static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
  struct fxdiv_divisor_uint64_t result = { d };
  if (d == 1) {
    /* Division by 1 is the identity: multiplier 1, no shifts. */
    result.m = UINT64_C(1);
    result.s1 = 0;
    result.s2 = 0;
  } else {
    /* l_minus_1 = floor(log2(d - 1)) = ceil(log2(d)) - 1. */
#if defined(__OPENCL_VERSION__)
    const uint32_t nlz_d = clz(d);
    const uint32_t l_minus_1 = 63 - clz(d - 1);
#elif defined(__CUDA_ARCH__)
    const uint32_t nlz_d = __clzll((long long) d);
    const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1));
#elif defined(_MSC_VER) && defined(_M_X64)
    unsigned long l_minus_1;
    _BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1));
    unsigned long bsr_d;
    _BitScanReverse64(&bsr_d, (unsigned __int64) d);
    /* BSR returns the highest set bit index; XOR with 0x3F converts it to
     * the leading-zero count. */
    const uint32_t nlz_d = bsr_d ^ 0x3F;
#elif defined(_MSC_VER) && defined(_M_IX86)
    const uint64_t d_minus_1 = d - 1;
    const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0;
    /* 32-bit MSVC has no 64-bit BSR: scan the high word first, then the low. */
    unsigned long l_minus_1;
    if ((uint32_t) (d_minus_1 >> 32) == 0) {
      _BitScanReverse(&l_minus_1, (unsigned long) d_minus_1);
    } else {
      _BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32));
      l_minus_1 += 32;
    }
    /* clz(d) differs from clz(d - 1) exactly when d is a power of 2. */
    const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2;
#elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
    uint64_t l_minus_1;
    __asm__("BSRQ %[d_minus_1], %[l_minus_1]"
      : [l_minus_1] "=r" (l_minus_1)
      : [d_minus_1] "r" (d - 1));
#elif defined(__GNUC__)
    const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1);
    const uint32_t nlz_d = __builtin_clzll(d);
#else
    /* Based on Algorithm 2 from Hacker's delight */
    const uint64_t d_minus_1 = d - 1;
    const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0;
    uint64_t l_minus_1 = 0;
    uint32_t x = d_minus_1;
    uint32_t y = d_minus_1 >> 32;
    if (y != 0) {
      l_minus_1 += 32;
      x = y;
    }
    y = x >> 16;
    if (y != 0) {
      l_minus_1 += 16;
      x = y;
    }
    y = x >> 8;
    if (y != 0) {
      l_minus_1 += 8;
      x = y;
    }
    y = x >> 4;
    if (y != 0) {
      l_minus_1 += 4;
      x = y;
    }
    y = x >> 2;
    if (y != 0) {
      l_minus_1 += 2;
      x = y;
    }
    if ((x & 2) != 0) {
      l_minus_1 += 1;
    }
    /* Same power-of-2 correction as the MSVC x86 branch above. */
    const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2;
#endif
    /* u_hi = 2^l - d where l = ceil(log2(d)); high word of the dividend
     * for the 2^(64+l) / d division below. */
    uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d;

    /* Division of 128-bit number u_hi:UINT64_C(0) by 64-bit number d, 64-bit quotient output q */
#if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
    uint64_t q;
    /* DIVQ divides RDX:RAX by d; RDX holds u_hi, RAX the zero low word. */
    __asm__("DIVQ %[d]"
      : "=a" (q), "+d" (u_hi)
      : [d] "r" (d), "a" (UINT64_C(0)));
#else
    /* Implementation based on code from Hacker's delight */

    /* Normalize divisor and shift divident left */
    d <<= nlz_d;
    u_hi <<= nlz_d;
    /* Break divisor up into two 32-bit digits */
    const uint64_t d_hi = (uint32_t) (d >> 32);
    const uint32_t d_lo = (uint32_t) d;

    /* Compute the first quotient digit, q1 */
    uint64_t q1 = u_hi / d_hi;
    uint64_t r1 = u_hi - q1 * d_hi;

    /* Correct the estimate downward while it overshoots (at most twice). */
    while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) {
      q1 -= 1;
      r1 += d_hi;
      if ((r1 >> 32) != 0) {
        break;
      }
    }

    /* Multiply and subtract. */
    u_hi = (u_hi << 32) - q1 * d;

    /* Compute the second quotient digit, q0 */
    uint64_t q0 = u_hi / d_hi;
    uint64_t r0 = u_hi - q0 * d_hi;

    /* Same overshoot correction for the low digit. */
    while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) {
      q0 -= 1;
      r0 += d_hi;
      if ((r0 >> 32) != 0) {
        break;
      }
    }
    const uint64_t q = (q1 << 32) | (uint32_t) q0;
#endif
    /* Round the reciprocal up; the s1 = 1 correction step in the quotient
     * routine compensates for the rounding. */
    result.m = q + UINT64_C(1);
    result.s1 = 1;
    result.s2 = (uint8_t) l_minus_1;
  }
  return result;
}
308
fxdiv_init_size_t(size_t d)309 static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) {
310 #if SIZE_MAX == UINT32_MAX
311 const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d);
312 #elif SIZE_MAX == UINT64_MAX
313 const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d);
314 #else
315 #error Unsupported platform
316 #endif
317 struct fxdiv_divisor_size_t size_result = {
318 (size_t) uint_result.value,
319 (size_t) uint_result.m,
320 uint_result.s1,
321 uint_result.s2
322 };
323 return size_result;
324 }
325
/*
 * Computes n / divisor.value using the precomputed magic constants:
 * t = mulhi(n, m), then q = (t + ((n - t) >> s1)) >> s2. The (n - t) >> s1
 * term corrects for the rounded-up multiplier chosen in fxdiv_init_uint32_t.
 */
static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
  const uint32_t hi = fxdiv_mulhi_uint32_t(n, divisor.m);
  const uint32_t correction = (n - hi) >> divisor.s1;
  return (hi + correction) >> divisor.s2;
}
330
/*
 * Computes n / divisor.value using the precomputed magic constants:
 * t = mulhi(n, m), then q = (t + ((n - t) >> s1)) >> s2. The (n - t) >> s1
 * term corrects for the rounded-up multiplier chosen in fxdiv_init_uint64_t.
 */
static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
  const uint64_t hi = fxdiv_mulhi_uint64_t(n, divisor.m);
  const uint64_t correction = (n - hi) >> divisor.s1;
  return (hi + correction) >> divisor.s2;
}
335
fxdiv_quotient_size_t(size_t n,const struct fxdiv_divisor_size_t divisor)336 static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
337 #if SIZE_MAX == UINT32_MAX
338 const struct fxdiv_divisor_uint32_t uint32_divisor = {
339 (uint32_t) divisor.value,
340 (uint32_t) divisor.m,
341 divisor.s1,
342 divisor.s2
343 };
344 return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor);
345 #elif SIZE_MAX == UINT64_MAX
346 const struct fxdiv_divisor_uint64_t uint64_divisor = {
347 (uint64_t) divisor.value,
348 (uint64_t) divisor.m,
349 divisor.s1,
350 divisor.s2
351 };
352 return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor);
353 #else
354 #error Unsupported platform
355 #endif
356 }
357
/* Computes n % divisor.value by subtracting q * d from n, where q comes
 * from the fast quotient routine. */
static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
  const uint32_t q = fxdiv_quotient_uint32_t(n, divisor);
  return n - q * divisor.value;
}
362
/* Computes n % divisor.value by subtracting q * d from n, where q comes
 * from the fast quotient routine. */
static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
  const uint64_t q = fxdiv_quotient_uint64_t(n, divisor);
  return n - q * divisor.value;
}
367
/* Computes n % divisor.value by subtracting q * d from n, where q comes
 * from the fast quotient routine. */
static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
  const size_t q = fxdiv_quotient_size_t(n, divisor);
  return n - q * divisor.value;
}
372
/* Rounds n down to the nearest multiple of granularity.value
 * (i.e. returns (n / g) * g). */
static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) {
  const uint32_t q = fxdiv_quotient_uint32_t(n, granularity);
  return q * granularity.value;
}
377
/* Rounds n down to the nearest multiple of granularity.value
 * (i.e. returns (n / g) * g). */
static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) {
  const uint64_t q = fxdiv_quotient_uint64_t(n, granularity);
  return q * granularity.value;
}
382
/* Rounds n down to the nearest multiple of granularity.value
 * (i.e. returns (n / g) * g). */
static inline size_t fxdiv_round_down_size_t(size_t n, const struct fxdiv_divisor_size_t granularity) {
  const size_t q = fxdiv_quotient_size_t(n, granularity);
  return q * granularity.value;
}
387
fxdiv_divide_uint32_t(uint32_t n,const struct fxdiv_divisor_uint32_t divisor)388 static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
389 const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
390 const uint32_t remainder = n - quotient * divisor.value;
391 struct fxdiv_result_uint32_t result = { quotient, remainder };
392 return result;
393 }
394
fxdiv_divide_uint64_t(uint64_t n,const struct fxdiv_divisor_uint64_t divisor)395 static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
396 const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
397 const uint64_t remainder = n - quotient * divisor.value;
398 struct fxdiv_result_uint64_t result = { quotient, remainder };
399 return result;
400 }
401
fxdiv_divide_size_t(size_t n,const struct fxdiv_divisor_size_t divisor)402 static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
403 const size_t quotient = fxdiv_quotient_size_t(n, divisor);
404 const size_t remainder = n - quotient * divisor.value;
405 struct fxdiv_result_size_t result = { quotient, remainder };
406 return result;
407 }
408
409 #endif /* FXDIV_H */
410