/**************************************************************************
 *
 * Copyright 2008 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


#ifndef BITSCAN_H
#define BITSCAN_H

#include <assert.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>

#if defined(_MSC_VER)
#include <intrin.h>
#endif

#if defined(__POPCNT__)
#include <popcntintrin.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif


/**
 * Find first bit set in word.  Least significant bit is 1.
 * Return 0 if no bits set.
 */
#ifdef HAVE___BUILTIN_FFS
#define ffs __builtin_ffs
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
static inline
int ffs(int i)
{
   unsigned long index;
   if (_BitScanForward(&index, i))
      return index + 1;
   else
      return 0;
}
#else
extern
int ffs(int i);
#endif

#ifdef HAVE___BUILTIN_FFSLL
#define ffsll __builtin_ffsll
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
static inline int
ffsll(long long int i)
{
   unsigned long index;
   if (_BitScanForward64(&index, i))
      return index + 1;
   else
      return 0;
}
#else
extern int
ffsll(long long int val);
#endif

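/* For example:
 *
 *    ffs(0x00000001) == 1   (bit 0 is the lowest set bit)
 *    ffs(0x00000018) == 4   (bit 3 is the lowest set bit)
 *    ffs(0)          == 0   (no bits set)
 *
 * ffsll() behaves the same way on 64-bit values.
 */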

/* Destructively loop over all of the bits in a mask as in:
 *
 * while (mymask) {
 *   int i = u_bit_scan(&mymask);
 *   ... process element i
 * }
 *
 */
static inline int
u_bit_scan(unsigned *mask)
{
   const int i = ffs(*mask) - 1;
   *mask ^= (1u << i);
   return i;
}

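/* Loop over all set bits of dword without destroying it, binding the bit
 * position to b on each iteration.  A small usage sketch:
 *
 *    uint32_t attribs = 0x13;
 *    u_foreach_bit(i, attribs) {
 *       ... runs with i = 0, 1, 4; attribs itself is left untouched
 *    }
 */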
#define u_foreach_bit(b, dword)                          \
   for (uint32_t __dword = (dword), b;                     \
        ((b) = ffs(__dword) - 1, __dword);      \
        __dword &= ~(1u << (b)))

static inline int
u_bit_scan64(uint64_t *mask)
{
   const int i = ffsll(*mask) - 1;
   *mask ^= (((uint64_t)1) << i);
   return i;
}

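/* 64-bit variant of u_foreach_bit; iterates the set bits of a uint64_t
 * mask without modifying it.
 */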
#define u_foreach_bit64(b, dword)                          \
   for (uint64_t __dword = (dword), b;                     \
        ((b) = ffsll(__dword) - 1, __dword);      \
        __dword &= ~(1ull << (b)))

/* Determine if an unsigned value is a power of two.
 *
 * v & (v - 1) clears the lowest set bit, so the result is zero exactly
 * when at most one bit is set.
 *
 * \note
 * Zero is treated as a power of two.
 */
static inline bool
util_is_power_of_two_or_zero(unsigned v)
{
   return (v & (v - 1)) == 0;
}

/* Determine if a uint64_t value is a power of two.
 *
 * \note
 * Zero is treated as a power of two.
 */
static inline bool
util_is_power_of_two_or_zero64(uint64_t v)
{
   return (v & (v - 1)) == 0;
}
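
/* For example:
 *
 *    util_is_power_of_two_or_zero(0)  == true
 *    util_is_power_of_two_or_zero(16) == true    (0x10 & 0x0f == 0)
 *    util_is_power_of_two_or_zero(24) == false   (0x18 & 0x17 == 0x10)
 */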

/* Determine if an unsigned value is a power of two.
 *
 * \note
 * Zero is \b not treated as a power of two.
 */
static inline bool
util_is_power_of_two_nonzero(unsigned v)
{
   /* __POPCNT__ is different from HAVE___BUILTIN_POPCOUNT.  The latter
    * indicates the existence of the __builtin_popcount function.  The former
    * indicates that _mm_popcnt_u32 exists and is a native instruction.
    *
    * The other alternative is to use SSE 4.2 compile-time flags.  This has
    * two drawbacks.  First, there is currently no build infrastructure for
    * SSE 4.2 (only 4.1), so that would have to be added.  Second, some AMD
    * CPUs support POPCNT but not SSE 4.2 (e.g., Barcelona).
    */
#ifdef __POPCNT__
   return _mm_popcnt_u32(v) == 1;
#else
   return v != 0 && (v & (v - 1)) == 0;
#endif
}

/* For walking a bitmask one run of consecutive set bits at a time,
 * for example:
 *
 * while (mask) {
 *    int start, count, i;
 *
 *    u_bit_scan_consecutive_range(&mask, &start, &count);
 *
 *    for (i = 0; i < count; i++)
 *       ... process element (start+i)
 * }
 */
static inline void
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
{
   if (*mask == 0xffffffff) {
      *start = 0;
      *count = 32;
      *mask = 0;
      return;
   }
   *start = ffs(*mask) - 1;
   *count = ffs(~(*mask >> *start)) - 1;
   *mask &= ~(((1u << *count) - 1) << *start);
}
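
/* For example, with mask == 0x76 (binary 0111 0110) the first call yields
 * start == 1, count == 2 and leaves mask == 0x70; the second call yields
 * start == 4, count == 3 and leaves mask == 0.
 */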

static inline void
u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
{
   if (*mask == ~0ull) {
      *start = 0;
      *count = 64;
      *mask = 0;
      return;
   }
   *start = ffsll(*mask) - 1;
   *count = ffsll(~(*mask >> *start)) - 1;
   *mask &= ~(((((uint64_t)1) << *count) - 1) << *start);
}


/**
 * Find last bit set in a word.  The least significant bit is 1.
 * Return 0 if no bits are set.
 * Essentially ffs() in the reverse direction.
 */
static inline unsigned
util_last_bit(unsigned u)
{
#if defined(HAVE___BUILTIN_CLZ)
   return u == 0 ? 0 : 32 - __builtin_clz(u);
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
   unsigned long index;
   if (_BitScanReverse(&index, u))
      return index + 1;
   else
      return 0;
#else
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}
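
/* For example:
 *
 *    util_last_bit(0x00000001) == 1
 *    util_last_bit(0x00000009) == 4    (highest set bit is bit 3)
 *    util_last_bit(0)          == 0
 *
 * Equivalently, util_last_bit(u) is the number of bits needed to
 * represent u.
 */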

/**
 * Find last bit set in a word.  The least significant bit is 1.
 * Return 0 if no bits are set.
 * Essentially ffsll() in the reverse direction.
 */
static inline unsigned
util_last_bit64(uint64_t u)
{
#if defined(HAVE___BUILTIN_CLZLL)
   return u == 0 ? 0 : 64 - __builtin_clzll(u);
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
   unsigned long index;
   if (_BitScanReverse64(&index, u))
      return index + 1;
   else
      return 0;
#else
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}

/**
 * Find the last bit in a word that does not match the sign bit.  The least
 * significant bit is 1.
 * Return 0 if all bits match the sign bit, i.e. for 0 and -1.
 */
static inline unsigned
util_last_bit_signed(int i)
{
   if (i >= 0)
      return util_last_bit(i);
   else
      return util_last_bit(~(unsigned)i);
}
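
/* For example:
 *
 *    util_last_bit_signed(5)  == 3    (0b101)
 *    util_last_bit_signed(-5) == 3    (~(-5) == 4 == 0b100)
 *    util_last_bit_signed(0)  == 0
 *    util_last_bit_signed(-1) == 0
 *
 * In other words, one less than the number of bits needed to represent
 * the value in two's complement.
 */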

/* Returns a bitfield in which count consecutive bits, starting at bit
 * start, are set.
 */
static inline unsigned
u_bit_consecutive(unsigned start, unsigned count)
{
   assert(start + count <= 32);
   if (count == 32)
      return ~0;
   return ((1u << count) - 1) << start;
}
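
/* For example, u_bit_consecutive(4, 3) == 0x70 (bits 4..6 set). */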

static inline uint64_t
u_bit_consecutive64(unsigned start, unsigned count)
{
   assert(start + count <= 64);
   if (count == 64)
      return ~(uint64_t)0;
   return (((uint64_t)1 << count) - 1) << start;
}

/**
 * Return number of bits set in n.
 */
static inline unsigned
util_bitcount(unsigned n)
{
#if defined(HAVE___BUILTIN_POPCOUNT)
   return __builtin_popcount(n);
#else
   /* K&R classic bitcount.
    *
    * For each iteration, clear the LSB from the bitfield.
    * Requires only one iteration per set bit, instead of
    * one iteration per bit less than highest set bit.
    */
   unsigned bits;
   for (bits = 0; n; bits++) {
      n &= n - 1;
   }
   return bits;
#endif
}
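
/* For example, util_bitcount(0x29) == 3: the K&R loop visits
 * 0x29 -> 0x28 -> 0x20 -> 0, clearing one set bit per iteration.
 */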

/**
 * Return the number of bits set in n using the native popcnt instruction.
 * The caller is responsible for ensuring that popcnt is supported by the CPU.
 *
 * gcc won't emit popcnt on its own unless -mpopcnt (or an -march= that
 * includes popcnt) is enabled, hence the inline assembly.
 */
static inline unsigned
util_popcnt_inline_asm(unsigned n)
{
#if defined(USE_X86_64_ASM) || defined(USE_X86_ASM)
   uint32_t out;
   __asm volatile("popcnt %1, %0" : "=r"(out) : "r"(n));
   return out;
#else
   /* We should never get here by accident, but I'm sure it'll happen. */
   return util_bitcount(n);
#endif
}

static inline unsigned
util_bitcount64(uint64_t n)
{
#ifdef HAVE___BUILTIN_POPCOUNTLL
   return __builtin_popcountll(n);
#else
   return util_bitcount(n) + util_bitcount(n >> 32);
#endif
}

/**
 * Widens the given bit mask by a multiplier, meaning that it will
 * replicate each bit that many times.
 *
 * For example:
 * 0b101 widened by 2 will become: 0b110011
 *
 * This is typically used in shader I/O to transform a 64-bit
 * writemask to a 32-bit writemask.
 */
static inline uint32_t
util_widen_mask(uint32_t mask, unsigned multiplier)
{
   uint32_t new_mask = 0;
   u_foreach_bit(i, mask)
      new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
   return new_mask;
}

#ifdef __cplusplus
}

/* util_bitcount has large measurable overhead (~2%), so it's recommended to
 * use the POPCNT instruction via inline assembly if the CPU supports it.
 */
enum util_popcnt {
   POPCNT_NO,
   POPCNT_YES,
};

/* A convenience function for selecting popcnt through a C++ template
 * argument.  This should be used as part of larger functions that are
 * optimized as a whole.
 */
template<util_popcnt POPCNT> inline unsigned
util_bitcount_fast(unsigned n)
{
   if (POPCNT == POPCNT_YES)
      return util_popcnt_inline_asm(n);
   else
      return util_bitcount(n);
}
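
/* A usage sketch (count_set_bits and has_popcnt_support are hypothetical
 * names, shown only to illustrate the intended pattern): instantiate the
 * enclosing hot function for both values of the template parameter and
 * dispatch once, so the branch is resolved at compile time inside the loop:
 *
 *    template<util_popcnt POPCNT> unsigned
 *    count_set_bits(const uint32_t *masks, unsigned n)
 *    {
 *       unsigned total = 0;
 *       for (unsigned i = 0; i < n; i++)
 *          total += util_bitcount_fast<POPCNT>(masks[i]);
 *       return total;
 *    }
 *
 *    unsigned total = has_popcnt_support()
 *       ? count_set_bits<POPCNT_YES>(masks, n)
 *       : count_set_bits<POPCNT_NO>(masks, n);
 */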

#endif /* __cplusplus */

#endif /* BITSCAN_H */