• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #pragma once
2 #ifndef FXDIV_H
3 #define FXDIV_H
4 
5 #if defined(__cplusplus) && (__cplusplus >= 201103L)
6 	#include <cstddef>
7 	#include <cstdint>
8 	#include <climits>
9 #elif !defined(__OPENCL_VERSION__)
10 	#include <stddef.h>
11 	#include <stdint.h>
12 	#include <limits.h>
13 #endif
14 
15 #if defined(_MSC_VER)
16 	#include <intrin.h>
17 #endif
18 
19 #ifndef FXDIV_USE_INLINE_ASSEMBLY
20 	#define FXDIV_USE_INLINE_ASSEMBLY 1
21 #endif
22 
fxdiv_mulext_uint32_t(uint32_t a,uint32_t b)23 static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) {
24 #if defined(_MSC_VER) && defined(_M_IX86)
25 	return (uint64_t) __emulu((unsigned int) a, (unsigned int) b);
26 #else
27 	return (uint64_t) a * (uint64_t) b;
28 #endif
29 }
30 
fxdiv_mulhi_uint32_t(uint32_t a,uint32_t b)31 static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) {
32 #if defined(__OPENCL_VERSION__)
33 	return mul_hi(a, b);
34 #elif defined(__CUDA_ARCH__)
35 	return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b);
36 #elif defined(_MSC_VER) && defined(_M_IX86)
37 	return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32);
38 #elif defined(_MSC_VER) && defined(_M_ARM)
39 	return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b);
40 #else
41 	return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32);
42 #endif
43 }
44 
fxdiv_mulhi_uint64_t(uint64_t a,uint64_t b)45 static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) {
46 #if defined(__OPENCL_VERSION__)
47 	return mul_hi(a, b);
48 #elif defined(__CUDA_ARCH__)
49 	return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b);
50 #elif defined(_MSC_VER) && defined(_M_X64)
51 	return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b);
52 #elif defined(__GNUC__) && defined(__SIZEOF_INT128__)
53 	return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64);
54 #else
55 	const uint32_t a_lo = (uint32_t) a;
56 	const uint32_t a_hi = (uint32_t) (a >> 32);
57 	const uint32_t b_lo = (uint32_t) b;
58 	const uint32_t b_hi = (uint32_t) (b >> 32);
59 
60 	const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) +
61 		(uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo);
62 	return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) +
63 		((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32);
64 #endif
65 }
66 
fxdiv_mulhi_size_t(size_t a,size_t b)67 static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) {
68 #if SIZE_MAX == UINT32_MAX
69 	return (size_t) fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b);
70 #elif SIZE_MAX == UINT64_MAX
71 	return (size_t) fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b);
72 #else
73 	#error Unsupported platform
74 #endif
75 }
76 
77 struct fxdiv_divisor_uint32_t {
78 	uint32_t value;
79 	uint32_t m;
80 	uint8_t s1;
81 	uint8_t s2;
82 };
83 
84 struct fxdiv_result_uint32_t {
85 	uint32_t quotient;
86 	uint32_t remainder;
87 };
88 
89 struct fxdiv_divisor_uint64_t {
90 	uint64_t value;
91 	uint64_t m;
92 	uint8_t s1;
93 	uint8_t s2;
94 };
95 
96 struct fxdiv_result_uint64_t {
97 	uint64_t quotient;
98 	uint64_t remainder;
99 };
100 
101 struct fxdiv_divisor_size_t {
102 	size_t value;
103 	size_t m;
104 	uint8_t s1;
105 	uint8_t s2;
106 };
107 
108 struct fxdiv_result_size_t {
109 	size_t quotient;
110 	size_t remainder;
111 };
112 
fxdiv_init_uint32_t(uint32_t d)113 static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
114 	struct fxdiv_divisor_uint32_t result = { d };
115 	if (d == 1) {
116 		result.m = UINT32_C(1);
117 		result.s1 = 0;
118 		result.s2 = 0;
119 	} else {
120 		#if defined(__OPENCL_VERSION__)
121 			const uint32_t l_minus_1 = 31 - clz(d - 1);
122 		#elif defined(__CUDA_ARCH__)
123 			const uint32_t l_minus_1 = 31 - __clz((int) (d - 1));
124 		#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
125 			unsigned long l_minus_1;
126 			_BitScanReverse(&l_minus_1, (unsigned long) (d - 1));
127 		#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY
128 			uint32_t l_minus_1;
129 			__asm__("BSRL %[d_minus_1], %[l_minus_1]"
130 				: [l_minus_1] "=r" (l_minus_1)
131 				: [d_minus_1] "r" (d - 1));
132 		#elif defined(__GNUC__)
133 			const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1);
134 		#else
135 			/* Based on Algorithm 2 from Hacker's delight */
136 
137 			uint32_t l_minus_1 = 0;
138 			uint32_t x = d - 1;
139 			uint32_t y = x >> 16;
140 			if (y != 0) {
141 				l_minus_1 += 16;
142 				x = y;
143 			}
144 			y = x >> 8;
145 			if (y != 0) {
146 				l_minus_1 += 8;
147 				x = y;
148 			}
149 			y = x >> 4;
150 			if (y != 0) {
151 				l_minus_1 += 4;
152 				x = y;
153 			}
154 			y = x >> 2;
155 			if (y != 0) {
156 				l_minus_1 += 2;
157 				x = y;
158 			}
159 			if ((x & 2) != 0) {
160 				l_minus_1 += 1;
161 			}
162 		#endif
163 		uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d;
164 
165 		/* Division of 64-bit number u_hi:UINT32_C(0) by 32-bit number d, 32-bit quotient output q */
166 		#if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY
167 			uint32_t q;
168 			__asm__("DIVL %[d]"
169 				: "=a" (q), "+d" (u_hi)
170 				: [d] "r" (d), "a" (0));
171 		#else
172 			const uint32_t q = ((uint64_t) u_hi << 32) / d;
173 		#endif
174 
175 		result.m = q + UINT32_C(1);
176 		result.s1 = 1;
177 		result.s2 = (uint8_t) l_minus_1;
178 	}
179 	return result;
180 }
181 
fxdiv_init_uint64_t(uint64_t d)182 static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
183 	struct fxdiv_divisor_uint64_t result = { d };
184 	if (d == 1) {
185 		result.m = UINT64_C(1);
186 		result.s1 = 0;
187 		result.s2 = 0;
188 	} else {
189 		#if defined(__OPENCL_VERSION__)
190 			const uint32_t nlz_d = clz(d);
191 			const uint32_t l_minus_1 = 63 - clz(d - 1);
192 		#elif defined(__CUDA_ARCH__)
193 			const uint32_t nlz_d = __clzll((long long) d);
194 			const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1));
195 		#elif defined(_MSC_VER) && defined(_M_X64)
196 			unsigned long l_minus_1;
197 			_BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1));
198 			unsigned long bsr_d;
199 			_BitScanReverse64(&bsr_d, (unsigned __int64) d);
200 			const uint32_t nlz_d = bsr_d ^ 0x3F;
201 		#elif defined(_MSC_VER) && defined(_M_IX86)
202 			const uint64_t d_minus_1 = d - 1;
203 			const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0;
204 			unsigned long l_minus_1;
205 			if ((uint32_t) (d_minus_1 >> 32) == 0) {
206 				_BitScanReverse(&l_minus_1, (unsigned long) d_minus_1);
207 			} else {
208 				_BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32));
209 				l_minus_1 += 32;
210 			}
211 			const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2;
212 		#elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
213 			uint64_t l_minus_1;
214 			__asm__("BSRQ %[d_minus_1], %[l_minus_1]"
215 				: [l_minus_1] "=r" (l_minus_1)
216 				: [d_minus_1] "r" (d - 1));
217 		#elif defined(__GNUC__)
218 			const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1);
219 			const uint32_t nlz_d = __builtin_clzll(d);
220 		#else
221 			/* Based on Algorithm 2 from Hacker's delight */
222 			const uint64_t d_minus_1 = d - 1;
223 			const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0;
224 			uint64_t l_minus_1 = 0;
225 			uint32_t x = d_minus_1;
226 			uint32_t y = d_minus_1 >> 32;
227 			if (y != 0) {
228 				l_minus_1 += 32;
229 				x = y;
230 			}
231 			y = x >> 16;
232 			if (y != 0) {
233 				l_minus_1 += 16;
234 				x = y;
235 			}
236 			y = x >> 8;
237 			if (y != 0) {
238 				l_minus_1 += 8;
239 				x = y;
240 			}
241 			y = x >> 4;
242 			if (y != 0) {
243 				l_minus_1 += 4;
244 				x = y;
245 			}
246 			y = x >> 2;
247 			if (y != 0) {
248 				l_minus_1 += 2;
249 				x = y;
250 			}
251 			if ((x & 2) != 0) {
252 				l_minus_1 += 1;
253 			}
254 			const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2;
255 		#endif
256 		uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d;
257 
258 		/* Division of 128-bit number u_hi:UINT64_C(0) by 64-bit number d, 64-bit quotient output q */
259 		#if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
260 			uint64_t q;
261 			__asm__("DIVQ %[d]"
262 				: "=a" (q), "+d" (u_hi)
263 				: [d] "r" (d), "a" (UINT64_C(0)));
264 		#else
265 			/* Implementation based on code from Hacker's delight */
266 
267 			/* Normalize divisor and shift divident left */
268 			d <<= nlz_d;
269 			u_hi <<= nlz_d;
270 			/* Break divisor up into two 32-bit digits */
271 			const uint64_t d_hi = (uint32_t) (d >> 32);
272 			const uint32_t d_lo = (uint32_t) d;
273 
274 			/* Compute the first quotient digit, q1 */
275 			uint64_t q1 = u_hi / d_hi;
276 			uint64_t r1 = u_hi - q1 * d_hi;
277 
278 			while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) {
279 				q1 -= 1;
280 				r1 += d_hi;
281 				if ((r1 >> 32) != 0) {
282 					break;
283 				}
284 			}
285 
286 			/* Multiply and subtract. */
287 			u_hi = (u_hi << 32) - q1 * d;
288 
289 			/* Compute the second quotient digit, q0 */
290 			uint64_t q0 = u_hi / d_hi;
291 			uint64_t r0 = u_hi - q0 * d_hi;
292 
293 			while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) {
294 				q0 -= 1;
295 				r0 += d_hi;
296 				if ((r0 >> 32) != 0) {
297 					break;
298 				}
299 			}
300 			const uint64_t q = (q1 << 32) | (uint32_t) q0;
301 		#endif
302 		result.m = q + UINT64_C(1);
303 		result.s1 = 1;
304 		result.s2 = (uint8_t) l_minus_1;
305 	}
306 	return result;
307 }
308 
fxdiv_init_size_t(size_t d)309 static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) {
310 #if SIZE_MAX == UINT32_MAX
311 	const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d);
312 #elif SIZE_MAX == UINT64_MAX
313 	const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d);
314 #else
315 	#error Unsupported platform
316 #endif
317 	struct fxdiv_divisor_size_t size_result = {
318 		(size_t) uint_result.value,
319 		(size_t) uint_result.m,
320 		uint_result.s1,
321 		uint_result.s2
322 	};
323 	return size_result;
324 }
325 
fxdiv_quotient_uint32_t(uint32_t n,const struct fxdiv_divisor_uint32_t divisor)326 static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
327 	const uint32_t t = fxdiv_mulhi_uint32_t(n, divisor.m);
328 	return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
329 }
330 
fxdiv_quotient_uint64_t(uint64_t n,const struct fxdiv_divisor_uint64_t divisor)331 static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
332 	const uint64_t t = fxdiv_mulhi_uint64_t(n, divisor.m);
333 	return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
334 }
335 
fxdiv_quotient_size_t(size_t n,const struct fxdiv_divisor_size_t divisor)336 static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
337 #if SIZE_MAX == UINT32_MAX
338 	const struct fxdiv_divisor_uint32_t uint32_divisor = {
339 		(uint32_t) divisor.value,
340 		(uint32_t) divisor.m,
341 		divisor.s1,
342 		divisor.s2
343 	};
344 	return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor);
345 #elif SIZE_MAX == UINT64_MAX
346 	const struct fxdiv_divisor_uint64_t uint64_divisor = {
347 		(uint64_t) divisor.value,
348 		(uint64_t) divisor.m,
349 		divisor.s1,
350 		divisor.s2
351 	};
352 	return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor);
353 #else
354 	#error Unsupported platform
355 #endif
356 }
357 
fxdiv_remainder_uint32_t(uint32_t n,const struct fxdiv_divisor_uint32_t divisor)358 static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
359 	const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
360 	return n - quotient * divisor.value;
361 }
362 
fxdiv_remainder_uint64_t(uint64_t n,const struct fxdiv_divisor_uint64_t divisor)363 static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
364 	const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
365 	return n - quotient * divisor.value;
366 }
367 
fxdiv_remainder_size_t(size_t n,const struct fxdiv_divisor_size_t divisor)368 static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
369 	const size_t quotient = fxdiv_quotient_size_t(n, divisor);
370 	return n - quotient * divisor.value;
371 }
372 
fxdiv_round_down_uint32_t(uint32_t n,const struct fxdiv_divisor_uint32_t granularity)373 static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) {
374 	const uint32_t quotient = fxdiv_quotient_uint32_t(n, granularity);
375 	return quotient * granularity.value;
376 }
377 
fxdiv_round_down_uint64_t(uint64_t n,const struct fxdiv_divisor_uint64_t granularity)378 static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) {
379 	const uint64_t quotient = fxdiv_quotient_uint64_t(n, granularity);
380 	return quotient * granularity.value;
381 }
382 
fxdiv_round_down_size_t(size_t n,const struct fxdiv_divisor_size_t granularity)383 static inline size_t fxdiv_round_down_size_t(size_t n, const struct fxdiv_divisor_size_t granularity) {
384 	const size_t quotient = fxdiv_quotient_size_t(n, granularity);
385 	return quotient * granularity.value;
386 }
387 
fxdiv_divide_uint32_t(uint32_t n,const struct fxdiv_divisor_uint32_t divisor)388 static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
389 	const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
390 	const uint32_t remainder = n - quotient * divisor.value;
391 	struct fxdiv_result_uint32_t result = { quotient, remainder };
392 	return result;
393 }
394 
fxdiv_divide_uint64_t(uint64_t n,const struct fxdiv_divisor_uint64_t divisor)395 static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
396 	const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
397 	const uint64_t remainder = n - quotient * divisor.value;
398 	struct fxdiv_result_uint64_t result = { quotient, remainder };
399 	return result;
400 }
401 
fxdiv_divide_size_t(size_t n,const struct fxdiv_divisor_size_t divisor)402 static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
403 	const size_t quotient = fxdiv_quotient_size_t(n, divisor);
404 	const size_t remainder = n - quotient * divisor.value;
405 	struct fxdiv_result_size_t result = { quotient, remainder };
406 	return result;
407 }
408 
409 #endif /* FXDIV_H */
410