#pragma once
#ifndef FXDIV_H
#define FXDIV_H

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#include <cstddef>
#include <cstdint>
#include <climits>
#elif !defined(__OPENCL_VERSION__)
#include <stddef.h>
#include <stdint.h>
#include <limits.h>
#endif

#if defined(_MSC_VER)
#include <intrin.h>
#if defined(_M_IX86) || defined(_M_X64)
#include <immintrin.h>
#endif
#endif

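/*
 * FXDIV_USE_INLINE_ASSEMBLY opts in to the hand-written inline-assembly code
 * paths below (x86 BSR/DIV instructions under GCC-compatible compilers). It
 * defaults to 0, so the portable or intrinsic-based paths are used unless the
 * including project defines this macro to 1 before including the header.
 */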
#ifndef FXDIV_USE_INLINE_ASSEMBLY
#define FXDIV_USE_INLINE_ASSEMBLY 0
#endif

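/*
 * Widening 32x32 -> 64-bit unsigned multiplication. On 32-bit MSVC the
 * __emulu intrinsic is used, presumably to get a single MUL instruction
 * instead of a generic 64-bit multiply; elsewhere a plain multiplication of
 * zero-extended operands suffices.
 */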
static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) {
#if defined(_MSC_VER) && defined(_M_IX86)
  return (uint64_t) __emulu((unsigned int) a, (unsigned int) b);
#else
  return (uint64_t) a * (uint64_t) b;
#endif
}

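/*
 * High 32 bits of the 64-bit product of two 32-bit operands, i.e.
 * (uint32_t) (((uint64_t) a * b) >> 32). OpenCL, CUDA, 32-bit MSVC and
 * MSVC/ARM expose dedicated intrinsics; the generic path just widens.
 */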
static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) {
#if defined(__OPENCL_VERSION__)
  return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
  return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b);
#elif defined(_MSC_VER) && defined(_M_IX86)
  return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32);
#elif defined(_MSC_VER) && defined(_M_ARM)
  return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b);
#else
  return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32);
#endif
}

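/*
 * High 64 bits of the 128-bit product of two 64-bit operands. Where no
 * 128-bit type or intrinsic is available, the product is assembled from
 * 32-bit halves: with a = a_hi:a_lo and b = b_hi:b_lo,
 *   a * b = ((a_hi * b_hi) << 64) + ((a_hi * b_lo + a_lo * b_hi) << 32) + a_lo * b_lo,
 * and the fallback below sums the carries of the cross terms to recover the
 * top half of the result.
 */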
static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) {
#if defined(__OPENCL_VERSION__)
  return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
  return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b);
#elif defined(_MSC_VER) && defined(_M_X64)
  return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b);
#elif defined(__GNUC__) && defined(__SIZEOF_INT128__)
  return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64);
#else
  const uint32_t a_lo = (uint32_t) a;
  const uint32_t a_hi = (uint32_t) (a >> 32);
  const uint32_t b_lo = (uint32_t) b;
  const uint32_t b_hi = (uint32_t) (b >> 32);

  const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) +
    (uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo);
  return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) +
    ((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32);
#endif
}

static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) {
#if SIZE_MAX == UINT32_MAX
  return (size_t) fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b);
#elif SIZE_MAX == UINT64_MAX
  return (size_t) fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b);
#else
#error Unsupported platform
#endif
}

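/*
 * Precomputed representation of an invariant unsigned divisor. The quotient
 * of n by value is later evaluated without a division instruction as
 *   t = mulhi(n, m);  q = (t + ((n - t) >> s1)) >> s2;
 * where m is a precomputed "magic" multiplier and s1/s2 are small shifts
 * (see the fxdiv_init_* and fxdiv_quotient_* functions below).
 */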
struct fxdiv_divisor_uint32_t {
  uint32_t value;
  uint32_t m;
  uint8_t s1;
  uint8_t s2;
};

struct fxdiv_result_uint32_t {
  uint32_t quotient;
  uint32_t remainder;
};

struct fxdiv_divisor_uint64_t {
  uint64_t value;
  uint64_t m;
  uint8_t s1;
  uint8_t s2;
};

struct fxdiv_result_uint64_t {
  uint64_t quotient;
  uint64_t remainder;
};

struct fxdiv_divisor_size_t {
  size_t value;
  size_t m;
  uint8_t s1;
  uint8_t s2;
};

struct fxdiv_result_size_t {
  size_t quotient;
  size_t remainder;
};

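/*
 * Precomputes the magic multiplier and shifts for a 32-bit divisor d.
 * For d > 1 the code derives l = ceil(log2(d)) from a bit scan of d - 1,
 * then sets m = floor(2^32 * (2^l - d) / d) + 1, s1 = 1 and s2 = l - 1,
 * which is the well-known division-by-invariant-integers construction
 * (cf. Granlund & Montgomery, "Division by Invariant Integers using
 * Multiplication"). d == 1 is special-cased to an identity transform.
 * Note that d == 0 is not a valid divisor and is not handled here.
 */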
static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
  struct fxdiv_divisor_uint32_t result = { d };
  if (d == 1) {
    result.m = UINT32_C(1);
    result.s1 = 0;
    result.s2 = 0;
  } else {
#if defined(__OPENCL_VERSION__)
    const uint32_t l_minus_1 = 31 - clz(d - 1);
#elif defined(__CUDA_ARCH__)
    const uint32_t l_minus_1 = 31 - __clz((int) (d - 1));
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64))
    unsigned long l_minus_1;
    _BitScanReverse(&l_minus_1, (unsigned long) (d - 1));
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY
    uint32_t l_minus_1;
    __asm__("BSRL %[d_minus_1], %[l_minus_1]"
      : [l_minus_1] "=r" (l_minus_1)
      : [d_minus_1] "r" (d - 1)
      : "cc");
#elif defined(__GNUC__)
    const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1);
#else
    /* Based on Algorithm 2 from Hacker's Delight */
    uint32_t l_minus_1 = 0;
    uint32_t x = d - 1;
    uint32_t y = x >> 16;
    if (y != 0) {
      l_minus_1 += 16;
      x = y;
    }
    y = x >> 8;
    if (y != 0) {
      l_minus_1 += 8;
      x = y;
    }
    y = x >> 4;
    if (y != 0) {
      l_minus_1 += 4;
      x = y;
    }
    y = x >> 2;
    if (y != 0) {
      l_minus_1 += 2;
      x = y;
    }
    if ((x & 2) != 0) {
      l_minus_1 += 1;
    }
#endif
    uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d;

    /* Division of 64-bit number u_hi:UINT32_C(0) by 32-bit number d, 32-bit quotient output q */
#if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY
    uint32_t q;
    __asm__("DIVL %[d]"
      : "=a" (q), "+d" (u_hi)
      : [d] "r" (d), "a" (0)
      : "cc");
#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64))
    unsigned int remainder;
    const uint32_t q = (uint32_t) _udiv64((unsigned __int64) ((uint64_t) u_hi << 32), (unsigned int) d, &remainder);
#else
    const uint32_t q = ((uint64_t) u_hi << 32) / d;
#endif

    result.m = q + UINT32_C(1);
    result.s1 = 1;
    result.s2 = (uint8_t) l_minus_1;
  }
  return result;
}

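/*
 * 64-bit counterpart of fxdiv_init_uint32_t. The construction is the same,
 * but computing m requires dividing the 128-bit value (2^l - d) * 2^64 by d.
 * Where no 128-by-64 division instruction or intrinsic is available, the
 * final fallback performs a schoolbook two-digit long division in base 2^32,
 * in the style of the long-division code from Hacker's Delight.
 */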
static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
  struct fxdiv_divisor_uint64_t result = { d };
  if (d == 1) {
    result.m = UINT64_C(1);
    result.s1 = 0;
    result.s2 = 0;
  } else {
#if defined(__OPENCL_VERSION__)
    const uint32_t nlz_d = clz(d);
    const uint32_t l_minus_1 = 63 - clz(d - 1);
#elif defined(__CUDA_ARCH__)
    const uint32_t nlz_d = __clzll((long long) d);
    const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1));
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
    unsigned long l_minus_1;
    _BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1));
    unsigned long bsr_d;
    _BitScanReverse64(&bsr_d, (unsigned __int64) d);
    const uint32_t nlz_d = bsr_d ^ 0x3F;
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_ARM))
    const uint64_t d_minus_1 = d - 1;
    const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0;
    unsigned long l_minus_1;
    if ((uint32_t) (d_minus_1 >> 32) == 0) {
      _BitScanReverse(&l_minus_1, (unsigned long) d_minus_1);
    } else {
      _BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32));
      l_minus_1 += 32;
    }
    const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2;
#elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
    uint64_t l_minus_1;
    __asm__("BSRQ %[d_minus_1], %[l_minus_1]"
      : [l_minus_1] "=r" (l_minus_1)
      : [d_minus_1] "r" (d - 1)
      : "cc");
#elif defined(__GNUC__)
    const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1);
    const uint32_t nlz_d = __builtin_clzll(d);
#else
    /* Based on Algorithm 2 from Hacker's Delight */
    const uint64_t d_minus_1 = d - 1;
    const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0;
    uint32_t l_minus_1 = 0;
    uint32_t x = (uint32_t) d_minus_1;
    uint32_t y = d_minus_1 >> 32;
    if (y != 0) {
      l_minus_1 += 32;
      x = y;
    }
    y = x >> 16;
    if (y != 0) {
      l_minus_1 += 16;
      x = y;
    }
    y = x >> 8;
    if (y != 0) {
      l_minus_1 += 8;
      x = y;
    }
    y = x >> 4;
    if (y != 0) {
      l_minus_1 += 4;
      x = y;
    }
    y = x >> 2;
    if (y != 0) {
      l_minus_1 += 2;
      x = y;
    }
    if ((x & 2) != 0) {
      l_minus_1 += 1;
    }
    const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2;
#endif
    uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d;

    /* Division of 128-bit number u_hi:UINT64_C(0) by 64-bit number d, 64-bit quotient output q */
#if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
    uint64_t q;
    __asm__("DIVQ %[d]"
      : "=a" (q), "+d" (u_hi)
      : [d] "r" (d), "a" (UINT64_C(0))
      : "cc");
#elif 0 && defined(__GNUC__) && defined(__SIZEOF_INT128__)
    /* GCC, Clang, and the Intel compiler fail to inline an optimized implementation and call into a support library for the 128-bit division */
    const uint64_t q = (uint64_t) (((unsigned __int128) u_hi << 64) / ((unsigned __int128) d));
#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(_M_X64)
    unsigned __int64 remainder;
    const uint64_t q = (uint64_t) _udiv128((unsigned __int64) u_hi, 0, (unsigned __int64) d, &remainder);
#else
    /* Implementation based on code from Hacker's Delight */

    /* Normalize the divisor and shift the dividend left */
    d <<= nlz_d;
    u_hi <<= nlz_d;
    /* Break divisor up into two 32-bit digits */
    const uint64_t d_hi = (uint32_t) (d >> 32);
    const uint32_t d_lo = (uint32_t) d;

    /* Compute the first quotient digit, q1 */
    uint64_t q1 = u_hi / d_hi;
    uint64_t r1 = u_hi - q1 * d_hi;

    while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) {
      q1 -= 1;
      r1 += d_hi;
      if ((r1 >> 32) != 0) {
        break;
      }
    }

    /* Multiply and subtract. */
    u_hi = (u_hi << 32) - q1 * d;

    /* Compute the second quotient digit, q0 */
    uint64_t q0 = u_hi / d_hi;
    uint64_t r0 = u_hi - q0 * d_hi;

    while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) {
      q0 -= 1;
      r0 += d_hi;
      if ((r0 >> 32) != 0) {
        break;
      }
    }
    const uint64_t q = (q1 << 32) | (uint32_t) q0;
#endif
    result.m = q + UINT64_C(1);
    result.s1 = 1;
    result.s2 = (uint8_t) l_minus_1;
  }
  return result;
}

static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) {
#if SIZE_MAX == UINT32_MAX
  const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d);
#elif SIZE_MAX == UINT64_MAX
  const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d);
#else
#error Unsupported platform
#endif
  struct fxdiv_divisor_size_t size_result = {
    (size_t) uint_result.value,
    (size_t) uint_result.m,
    uint_result.s1,
    uint_result.s2
  };
  return size_result;
}

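/*
 * Quotient of n divided by the precomputed divisor:
 *   t = mulhi(n, m);  q = (t + ((n - t) >> s1)) >> s2;
 * A typical use (illustrative sketch, hypothetical variable names):
 *
 *   const struct fxdiv_divisor_uint32_t d = fxdiv_init_uint32_t(7);
 *   for (uint32_t i = 0; i < count; i++) {
 *     out[i] = fxdiv_quotient_uint32_t(in[i], d);  // same result as in[i] / 7
 *   }
 *
 * The up-front cost of fxdiv_init_* amortizes when the same divisor is
 * reused across many divisions.
 */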
static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
  const uint32_t t = fxdiv_mulhi_uint32_t(n, divisor.m);
  return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
}

static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
  const uint64_t t = fxdiv_mulhi_uint64_t(n, divisor.m);
  return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
}

static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
#if SIZE_MAX == UINT32_MAX
  const struct fxdiv_divisor_uint32_t uint32_divisor = {
    (uint32_t) divisor.value,
    (uint32_t) divisor.m,
    divisor.s1,
    divisor.s2
  };
  return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor);
#elif SIZE_MAX == UINT64_MAX
  const struct fxdiv_divisor_uint64_t uint64_divisor = {
    (uint64_t) divisor.value,
    (uint64_t) divisor.m,
    divisor.s1,
    divisor.s2
  };
  return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor);
#else
#error Unsupported platform
#endif
}

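/*
 * Remainder of n divided by the precomputed divisor, computed from the
 * quotient as n - q * value (one multiplication instead of a modulo).
 */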
static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
  const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
  return n - quotient * divisor.value;
}

static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
  const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
  return n - quotient * divisor.value;
}

static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
  const size_t quotient = fxdiv_quotient_size_t(n, divisor);
  return n - quotient * divisor.value;
}

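/*
 * Rounds n down to the nearest multiple of the granularity, i.e.
 * (n / granularity.value) * granularity.value. For example, with a
 * granularity of 10, n = 37 rounds down to 30.
 */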
static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) {
  const uint32_t quotient = fxdiv_quotient_uint32_t(n, granularity);
  return quotient * granularity.value;
}

static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) {
  const uint64_t quotient = fxdiv_quotient_uint64_t(n, granularity);
  return quotient * granularity.value;
}

static inline size_t fxdiv_round_down_size_t(size_t n, const struct fxdiv_divisor_size_t granularity) {
  const size_t quotient = fxdiv_quotient_size_t(n, granularity);
  return quotient * granularity.value;
}

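/*
 * Computes quotient and remainder in one call and returns both in an
 * fxdiv_result_* struct; useful when both values are needed, e.g. when
 * converting a linear index into multi-dimensional coordinates.
 */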
static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
  const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
  const uint32_t remainder = n - quotient * divisor.value;
  struct fxdiv_result_uint32_t result = { quotient, remainder };
  return result;
}

static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
  const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
  const uint64_t remainder = n - quotient * divisor.value;
  struct fxdiv_result_uint64_t result = { quotient, remainder };
  return result;
}

static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
  const size_t quotient = fxdiv_quotient_size_t(n, divisor);
  const size_t remainder = n - quotient * divisor.value;
  struct fxdiv_result_size_t result = { quotient, remainder };
  return result;
}

#endif /* FXDIV_H */