/**************************************************************************
 *
 * Copyright 2008 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


#ifndef BITSCAN_H
#define BITSCAN_H

#include <assert.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>

#if defined(_MSC_VER)
#include <intrin.h>
#endif

#if defined(__POPCNT__)
#include <popcntintrin.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif


/**
 * Find first bit set in a word. The least significant bit is 1.
 * Return 0 if no bits are set.
 */
#ifdef HAVE___BUILTIN_FFS
#define ffs __builtin_ffs
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
static inline
int ffs(int i)
{
   unsigned long index;
   if (_BitScanForward(&index, i))
      return index + 1;
   else
      return 0;
}
#else
extern
int ffs(int i);
#endif

#ifdef HAVE___BUILTIN_FFSLL
#define ffsll __builtin_ffsll
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
static inline int
ffsll(long long int i)
{
   unsigned long index;
   if (_BitScanForward64(&index, i))
      return index + 1;
   else
      return 0;
}
#else
extern int
ffsll(long long int val);
#endif
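
/* Example (illustrative only): bit positions are numbered from 1, so
 *
 *    ffs(0x30) == 5    (lowest set bit is bit 4, value 0x10)
 *    ffs(0)    == 0    (no bits set)
 *    ffsll(1ull << 40) == 41
 */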


/* Destructively loop over all of the bits in a mask as in:
 *
 * while (mymask) {
 *    int i = u_bit_scan(&mymask);
 *    ... process element i
 * }
 *
 */
static inline int
u_bit_scan(unsigned *mask)
{
   const int i = ffs(*mask) - 1;
   *mask ^= (1u << i);
   return i;
}

#define u_foreach_bit(b, dword) \
   for (uint32_t __dword = (dword), b; \
        ((b) = ffs(__dword) - 1, __dword); \
        __dword &= ~(1u << (b)))
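
/* Example (illustrative only): iterate the set bits of a mask from least to
 * most significant. For mask = 0xA (0b1010) the body runs with b = 1, then
 * b = 3:
 *
 *    u_foreach_bit(b, 0xA) {
 *       ... process bit b
 *    }
 */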

static inline int
u_bit_scan64(uint64_t *mask)
{
   const int i = ffsll(*mask) - 1;
   *mask ^= (((uint64_t)1) << i);
   return i;
}

#define u_foreach_bit64(b, dword) \
   for (uint64_t __dword = (dword), b; \
        ((b) = ffsll(__dword) - 1, __dword); \
        __dword &= ~(1ull << (b)))

/* Determine if an unsigned value is a power of two.
 *
 * \note
 * Zero is treated as a power of two.
 */
static inline bool
util_is_power_of_two_or_zero(unsigned v)
{
   return (v & (v - 1)) == 0;
}
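
/* Example (illustrative only): subtracting 1 clears the lowest set bit and
 * sets all bits below it, so the test only passes when at most one bit is
 * set:
 *
 *    util_is_power_of_two_or_zero(8)  == true    (0b1000 & 0b0111 == 0)
 *    util_is_power_of_two_or_zero(0)  == true    (0 & 0xffffffff == 0)
 *    util_is_power_of_two_or_zero(12) == false   (0b1100 & 0b1011 == 0b1000)
 */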

/* Determine if a uint64_t value is a power of two.
 *
 * \note
 * Zero is treated as a power of two.
 */
static inline bool
util_is_power_of_two_or_zero64(uint64_t v)
{
   return (v & (v - 1)) == 0;
}

/* Determine if an unsigned value is a power of two.
 *
 * \note
 * Zero is \b not treated as a power of two.
 */
static inline bool
util_is_power_of_two_nonzero(unsigned v)
{
   /* __POPCNT__ is different from HAVE___BUILTIN_POPCOUNT. The latter
    * indicates the existence of the __builtin_popcount function. The former
    * indicates that _mm_popcnt_u32 exists and is a native instruction.
    *
    * The other alternative is to use SSE 4.2 compile-time flags. This has
    * two drawbacks. First, there is currently no build infrastructure for
    * SSE 4.2 (only 4.1), so that would have to be added. Second, some AMD
    * CPUs support POPCNT but not SSE 4.2 (e.g., Barcelona).
    */
#ifdef __POPCNT__
   return _mm_popcnt_u32(v) == 1;
#else
   return v != 0 && (v & (v - 1)) == 0;
#endif
}

/* For looping over a bitmask when you want to process consecutive ranges of
 * bits manually, for example:
 *
 * while (mask) {
 *    int start, count, i;
 *
 *    u_bit_scan_consecutive_range(&mask, &start, &count);
 *
 *    for (i = 0; i < count; i++)
 *       ... process element (start+i)
 * }
 */
static inline void
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
{
   if (*mask == 0xffffffff) {
      *start = 0;
      *count = 32;
      *mask = 0;
      return;
   }
   *start = ffs(*mask) - 1;
   *count = ffs(~(*mask >> *start)) - 1;
   *mask &= ~(((1u << *count) - 1) << *start);
}
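
/* Example (illustrative only): for mask = 0x3c (0b111100) a single call
 * yields start = 2, count = 4 and clears those bits, leaving mask = 0.
 * As in the loop above, the mask must be non-zero before each call.
 */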

static inline void
u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
{
   if (*mask == ~0ull) {
      *start = 0;
      *count = 64;
      *mask = 0;
      return;
   }
   *start = ffsll(*mask) - 1;
   *count = ffsll(~(*mask >> *start)) - 1;
   *mask &= ~(((((uint64_t)1) << *count) - 1) << *start);
}


/**
 * Find last bit set in a word. The least significant bit is 1.
 * Return 0 if no bits are set.
 * Essentially ffs() in the reverse direction.
 */
static inline unsigned
util_last_bit(unsigned u)
{
#if defined(HAVE___BUILTIN_CLZ)
   return u == 0 ? 0 : 32 - __builtin_clz(u);
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
   unsigned long index;
   if (_BitScanReverse(&index, u))
      return index + 1;
   else
      return 0;
#else
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}
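
/* Example (illustrative only): util_last_bit(u) is the number of bits needed
 * to represent u:
 *
 *    util_last_bit(0)    == 0
 *    util_last_bit(1)    == 1
 *    util_last_bit(0x10) == 5    (highest set bit is bit 4)
 */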

/**
 * Find last bit set in a word. The least significant bit is 1.
 * Return 0 if no bits are set.
 * Essentially ffsll() in the reverse direction.
 */
static inline unsigned
util_last_bit64(uint64_t u)
{
#if defined(HAVE___BUILTIN_CLZLL)
   return u == 0 ? 0 : 64 - __builtin_clzll(u);
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
   unsigned long index;
   if (_BitScanReverse64(&index, u))
      return index + 1;
   else
      return 0;
#else
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}

/**
 * Find last bit in a word that does not match the sign bit. The least
 * significant bit is 1.
 * Return 0 if no bits are set.
 */
static inline unsigned
util_last_bit_signed(int i)
{
   if (i >= 0)
      return util_last_bit(i);
   else
      return util_last_bit(~(unsigned)i);
}
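
/* Example (illustrative only): negative inputs scan the complement, so the
 * result is the number of value bits needed to represent i in two's
 * complement, excluding the sign bit:
 *
 *    util_last_bit_signed(5)  == 3    (0b101)
 *    util_last_bit_signed(-5) == 3    (~(unsigned)-5 == 0b100)
 *    util_last_bit_signed(-1) == 0    (~(unsigned)-1 == 0)
 */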

/* Returns a bitfield in which the first count bits starting at start are
 * set.
 */
static inline unsigned
u_bit_consecutive(unsigned start, unsigned count)
{
   assert(start + count <= 32);
   if (count == 32)
      return ~0;
   return ((1u << count) - 1) << start;
}
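
/* Example (illustrative only): u_bit_consecutive(4, 3) == 0x70 (0b1110000).
 * The count == 32 special case avoids the undefined shift 1u << 32.
 */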

static inline uint64_t
u_bit_consecutive64(unsigned start, unsigned count)
{
   assert(start + count <= 64);
   if (count == 64)
      return ~(uint64_t)0;
   return (((uint64_t)1 << count) - 1) << start;
}

/**
 * Return number of bits set in n.
 */
static inline unsigned
util_bitcount(unsigned n)
{
#if defined(HAVE___BUILTIN_POPCOUNT)
   return __builtin_popcount(n);
#else
   /* K&R classic bitcount.
    *
    * For each iteration, clear the LSB from the bitfield.
    * Requires only one iteration per set bit, instead of
    * one iteration per bit less than highest set bit.
    */
   unsigned bits;
   for (bits = 0; n; bits++) {
      n &= n - 1;
   }
   return bits;
#endif
}
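
/* Example (illustrative only): in the fallback, n &= n - 1 clears the lowest
 * set bit each iteration. For n = 0b1100:
 *
 *    0b1100 & 0b1011 == 0b1000
 *    0b1000 & 0b0111 == 0
 *
 * so the loop runs twice and util_bitcount(0b1100) == 2.
 */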

/**
 * Return the number of bits set in n using the native popcnt instruction.
 * The caller is responsible for ensuring that popcnt is supported by the CPU.
 *
 * gcc will not emit popcnt on its own unless -mpopcnt, or an -march that
 * includes popcnt, is specified.
 */
static inline unsigned
util_popcnt_inline_asm(unsigned n)
{
#if defined(USE_X86_64_ASM) || defined(USE_X86_ASM)
   uint32_t out;
   __asm volatile("popcnt %1, %0" : "=r"(out) : "r"(n));
   return out;
#else
   /* We should never get here by accident, but I'm sure it'll happen. */
   return util_bitcount(n);
#endif
}

static inline unsigned
util_bitcount64(uint64_t n)
{
#ifdef HAVE___BUILTIN_POPCOUNTLL
   return __builtin_popcountll(n);
#else
   return util_bitcount(n) + util_bitcount(n >> 32);
#endif
}
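
/* Example (illustrative only): the fallback counts the two 32-bit halves
 * separately (the unsigned argument truncates to the low half), so
 * util_bitcount64(0x100000001ull) == 2.
 */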

/**
 * Widens the given bit mask by a multiplier, meaning that it will
 * replicate each bit by that amount.
 *
 * For example:
 * 0b101 widened by 2 will become: 0b110011
 *
 * This is typically used in shader I/O to transform a 64-bit
 * writemask to a 32-bit writemask.
 */
static inline uint32_t
util_widen_mask(uint32_t mask, unsigned multiplier)
{
   uint32_t new_mask = 0;
   u_foreach_bit(i, mask)
      new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
   return new_mask;
}
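
/* Example (illustrative only): widening 0b101 by 2 visits set bits i = 0 and
 * i = 2 and ORs in 0b11 << 0 and 0b11 << 4, giving the 0b110011 result shown
 * above.
 */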

#ifdef __cplusplus
}

/* util_bitcount has large measurable overhead (~2%), so it's recommended to
 * use the POPCNT instruction via inline assembly if the CPU supports it.
 */
enum util_popcnt {
   POPCNT_NO,
   POPCNT_YES,
};

/* Convenient function to select popcnt through a C++ template argument.
 * This should be used as part of larger functions that are optimized
 * as a whole.
 */
template<util_popcnt POPCNT> inline unsigned
util_bitcount_fast(unsigned n)
{
   if (POPCNT == POPCNT_YES)
      return util_popcnt_inline_asm(n);
   else
      return util_bitcount(n);
}
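
/* Example (illustrative only): a caller that has already detected CPU support
 * (the has_popcnt flag below is hypothetical) can select the variant once:
 *
 *    unsigned count = has_popcnt ? util_bitcount_fast<POPCNT_YES>(n)
 *                                : util_bitcount_fast<POPCNT_NO>(n);
 */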

#endif /* __cplusplus */

#endif /* BITSCAN_H */